xref: /titanic_50/usr/src/uts/common/os/rctl.c (revision 14ea4bb737263733ad80a36b4f73f681c30a6b45)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/atomic.h>
29 #include <sys/cmn_err.h>
30 #include <sys/id_space.h>
31 #include <sys/kmem.h>
32 #include <sys/kstat.h>
33 #include <sys/log.h>
34 #include <sys/modctl.h>
35 #include <sys/modhash.h>
36 #include <sys/mutex.h>
37 #include <sys/proc.h>
38 #include <sys/procset.h>
39 #include <sys/project.h>
40 #include <sys/resource.h>
41 #include <sys/rctl.h>
42 #include <sys/siginfo.h>
43 #include <sys/strlog.h>
44 #include <sys/systm.h>
45 #include <sys/task.h>
46 #include <sys/types.h>
47 #include <sys/policy.h>
48 #include <sys/zone.h>
49 
50 /*
51  * Resource controls (rctls)
52  *
53  *   The rctl subsystem provides a mechanism for kernel components to
54  *   register their individual resource controls with the system as a whole,
55  *   such that those controls can subscribe to specific actions while being
56  *   associated with the various process-model entities provided by the kernel:
57  *   the process, the task, the project, and the zone.  (In principle, only
58  *   minor modifications would be required to connect the resource control
59  *   functionality to non-process-model entities associated with the system.)
60  *
61  *   Subsystems register their rctls via rctl_register().  Subsystems
62  *   also wishing to provide additional limits on a given rctl can modify
63  *   them once they have the rctl handle.  Each subsystem should store the
64  *   handle to their rctl for direct access.
65  *
66  *   A primary dictionary, rctl_dict, contains a hash of id to the default
67  *   control definition for each controlled resource-entity pair on the system.
68  *   A secondary dictionary, rctl_dict_by_name, contains a hash of name to
69  *   resource control handles.  The resource control handles are distributed by
70  *   the rctl_ids ID space.  The handles are private and not to be
71  *   advertised to userland; all userland interactions are via the rctl
72  *   names.
73  *
74  *   Entities inherit their rctls from their predecessor.  Since projects have
75  *   no ancestor, they inherit their rctls from the rctl dict for project
76  *   rctls.  It is expected that project controls will be set to their
77  *   appropriate values shortly after project creation, presumably from a
78  *   policy source such as the project database.
79  *
80  * Data structures
81  *   The rctl_set_t attached to each of the process model entities is a simple
82  *   hash table keyed on the rctl handle assigned at registration.  The entries
83  *   in the hash table are rctl_t's, whose relationship with the active control
84  *   values on that resource and with the global state of the resource we
85  *   illustrate below:
86  *
87  *   rctl_dict[key] --> rctl_dict_entry
88  *			   ^
89  *			   |
90  *			+--+---+
91  *   rctl_set[key] ---> | rctl | --> value <-> value <-> system value --> NULL
92  *			+--+---+		 ^
93  *			   |			 |
94  *			   +------- cursor ------+
95  *
96  *   That is, the rctl contains a back pointer to the global resource control
97  *   state for this resource, which is also available in the rctl_dict hash
98  *   table mentioned earlier.  The rctl contains two pointers to resource
99  *   control values:  one, values, indicates the entire sequence of control
100  *   values; the other, cursor, indicates the currently active control
101  *   value--the next value to be enforced.  The value list itself is an open,
102  *   doubly-linked list, the last non-NULL member of which is the system value
103  *   for that resource (being the theoretical/conventional maximum allowable
104  *   value for the resource on this OS instance).
105  *
106  * Ops Vector
107  *   Subsystems publishing rctls need not provide instances of all of the
108  *   functions specified by the ops vector.  In particular, if general
109  *   rctl_*() entry points are not being called, certain functions can be
110  *   omitted.  These align as follows:
111  *
112  *   rctl_set()
 *     You may wish to provide a set callback if locking circumstances prevent
 *     the enforced value from being requested from the resource control, or
 *     if the performance cost of doing so is prohibitively expensive.  For
 *     instance, the currently enforced file size limit is stored on the
 *     process in the p_fsz_ctl to maintain read()/write() performance.
118  *
119  *   rctl_test()
120  *     You must provide a test callback if you are using the rctl_test()
121  *     interface.  An action callback is optional.
122  *
123  *   rctl_action()
124  *     You may wish to provide an action callback.
125  *
126  * Registration
127  *   New resource controls can be added to a running instance by loaded modules
128  *   via registration.  (The current implementation does not support unloadable
129  *   modules; this functionality can be added if needed, via an
130  *   activation/deactivation interface involving the manipulation of the
131  *   ops vector for the resource control(s) needing to support unloading.)
132  *
133  * Control value ordering
134  *   Because the rctl_val chain on each rctl must be navigable in a
135  *   deterministic way, we have to define an ordering on the rctl_val_t's.  The
136  *   defined order is (flags & [maximal], value, flags & [deny-action],
137  *   privilege).
138  *
139  * Locking
 *   rctl_dict_lock must be acquired prior to rctl_lists_lock.  Since
 *   rctl_dict_lock or rctl_lists_lock can be acquired at the enforcement point
 *   of any subsystem, while that subsystem's own locks are held, it is at all
 *   times inappropriate to call kmem_alloc(., KM_SLEEP) while holding either
 *   of these locks.  Traversing any of the various resource control entity
 *   lists requires holding rctl_lists_lock.
146  *
147  *   Each individual resource control set associated with an entity must have
148  *   its rcs_lock held for the duration of any operations that would add
149  *   resource controls or control values to the set.
150  *
151  *   The locking subsequence of interest is: p_lock, rctl_dict_lock,
152  *   rctl_lists_lock, entity->rcs_lock.
153  */
154 
155 id_t max_rctl_hndl = 32768;
156 int rctl_dict_size = 64;
157 int rctl_set_size = 8;
158 kmutex_t rctl_dict_lock;
159 mod_hash_t *rctl_dict;
160 mod_hash_t *rctl_dict_by_name;
161 id_space_t *rctl_ids;
162 kmem_cache_t *rctl_cache;	/* kmem cache for rctl structures */
163 kmem_cache_t *rctl_val_cache;	/* kmem cache for rctl values */
164 
165 kmutex_t rctl_lists_lock;
166 rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1];
167 
168 /*
169  * Default resource control operations and ops vector
170  *   To be used if the particular rcontrol has no specific actions defined, or
171  *   if the subsystem providing the control is quiescing (in preparation for
172  *   unloading, presumably.)
173  *
174  *   Resource controls with callbacks should fill the unused operations with the
175  *   appropriate default impotent callback.
176  */
177 /*ARGSUSED*/
178 void
179 rcop_no_action(struct rctl *r, struct proc *p, rctl_entity_p_t *e)
180 {
181 }
182 
183 /*ARGSUSED*/
184 rctl_qty_t
185 rcop_no_usage(struct rctl *r, struct proc *p)
186 {
187 	return (0);
188 }
189 
190 /*ARGSUSED*/
191 int
192 rcop_no_set(struct rctl *r, struct proc *p, rctl_entity_p_t *e, rctl_qty_t l)
193 {
194 	return (0);
195 }
196 
197 /*ARGSUSED*/
198 int
199 rcop_no_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
200     struct rctl_val *rv, rctl_qty_t i, uint_t f)
201 {
202 	return (0);
203 }
204 
205 rctl_ops_t rctl_default_ops = {
206 	rcop_no_action,
207 	rcop_no_usage,
208 	rcop_no_set,
209 	rcop_no_test
210 };
211 
212 /*
213  * Default "absolute" resource control operation and ops vector
214  *   Useful if there is no usage associated with the
215  *   resource control.
216  */
217 /*ARGSUSED*/
218 int
219 rcop_absolute_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
220     struct rctl_val *rv, rctl_qty_t i, uint_t f)
221 {
222 	return (i > rv->rcv_value);
223 }
224 
225 rctl_ops_t rctl_absolute_ops = {
226 	rcop_no_action,
227 	rcop_no_usage,
228 	rcop_no_set,
229 	rcop_absolute_test
230 };
231 
232 /*ARGSUSED*/
233 static uint_t
234 rctl_dict_hash_by_id(void *hash_data, mod_hash_key_t key)
235 {
236 	return ((uint_t)(uintptr_t)key % rctl_dict_size);
237 }
238 
239 static int
240 rctl_dict_id_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
241 {
242 	uint_t u1 = (uint_t)(uintptr_t)key1;
243 	uint_t u2 = (uint_t)(uintptr_t)key2;
244 
245 	if (u1 > u2)
246 		return (1);
247 
248 	if (u1 == u2)
249 		return (0);
250 
251 	return (-1);
252 }
253 
254 static void
255 rctl_dict_val_dtor(mod_hash_val_t val)
256 {
257 	rctl_dict_entry_t *kr = (rctl_dict_entry_t *)val;
258 
259 	kmem_free(kr, sizeof (rctl_dict_entry_t));
260 }
261 
262 /*
263  * size_t rctl_build_name_buf()
264  *
265  * Overview
266  *   rctl_build_name_buf() walks all active resource controls in the dictionary,
267  *   building a buffer of continguous NUL-terminated strings.
268  *
269  * Return values
270  *   The size of the buffer is returned, the passed pointer's contents are
271  *   modified to that of the location of the buffer.
272  *
273  * Caller's context
274  *   Caller must be in a context suitable for KM_SLEEP allocations.
275  */
276 size_t
277 rctl_build_name_buf(char **rbufp)
278 {
279 	size_t req_size, cpy_size;
280 	char *rbufloc;
281 	int i;
282 
283 rctl_rebuild_name_buf:
284 	req_size = cpy_size = 0;
285 
286 	/*
287 	 * Calculate needed buffer length.
288 	 */
289 	mutex_enter(&rctl_lists_lock);
290 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
291 		rctl_dict_entry_t *rde;
292 
293 		for (rde = rctl_lists[i];
294 		    rde != NULL;
295 		    rde = rde->rcd_next)
296 			req_size += strlen(rde->rcd_name) + 1;
297 	}
298 	mutex_exit(&rctl_lists_lock);
299 
300 	rbufloc = *rbufp = kmem_alloc(req_size, KM_SLEEP);
301 
302 	/*
303 	 * Copy rctl names into our buffer.  If the copy length exceeds the
304 	 * allocate length (due to registration changes), stop copying, free the
305 	 * buffer, and start again.
306 	 */
307 	mutex_enter(&rctl_lists_lock);
308 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
309 		rctl_dict_entry_t *rde;
310 
311 		for (rde = rctl_lists[i];
312 		    rde != NULL;
313 		    rde = rde->rcd_next) {
314 			size_t length = strlen(rde->rcd_name) + 1;
315 
316 			cpy_size += length;
317 
318 			if (cpy_size > req_size) {
319 				kmem_free(*rbufp, req_size);
320 				mutex_exit(&rctl_lists_lock);
321 				goto rctl_rebuild_name_buf;
322 			}
323 
324 			bcopy(rde->rcd_name, rbufloc, length);
325 			rbufloc += length;
326 		}
327 	}
328 	mutex_exit(&rctl_lists_lock);
329 
330 	return (req_size);
331 }
332 
333 /*
334  * rctl_dict_entry_t *rctl_dict_lookup(const char *)
335  *
336  * Overview
337  *   rctl_dict_lookup() returns the resource control dictionary entry for the
338  *   named resource control.
339  *
340  * Return values
341  *   A pointer to the appropriate resource control dictionary entry, or NULL if
342  *   no such named entry exists.
343  *
344  * Caller's context
345  *   Caller must not be holding rctl_dict_lock.
346  */
347 rctl_dict_entry_t *
348 rctl_dict_lookup(const char *name)
349 {
350 	rctl_dict_entry_t *rde;
351 
352 	mutex_enter(&rctl_dict_lock);
353 
354 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
355 	    (mod_hash_val_t *)&rde) == MH_ERR_NOTFOUND) {
356 		mutex_exit(&rctl_dict_lock);
357 		return (NULL);
358 	}
359 
360 	mutex_exit(&rctl_dict_lock);
361 
362 	return (rde);
363 }
364 
365 /*
366  * rctl_hndl_t rctl_hndl_lookup(const char *)
367  *
368  * Overview
369  *   rctl_hndl_lookup() returns the resource control id (the "handle") for the
370  *   named resource control.
371  *
372  * Return values
373  *   The appropriate id, or -1 if no such named entry exists.
374  *
375  * Caller's context
376  *   Caller must not be holding rctl_dict_lock.
377  */
378 rctl_hndl_t
379 rctl_hndl_lookup(const char *name)
380 {
381 	rctl_dict_entry_t *rde;
382 
383 	if ((rde = rctl_dict_lookup(name)) == NULL)
384 		return (-1);
385 
386 	return (rde->rcd_id);
387 }
388 
389 /*
390  * rctl_dict_entry_t * rctl_dict_lookup_hndl(rctl_hndl_t)
391  *
392  * Overview
393  *   rctl_dict_lookup_hndl() completes the public lookup functions, by returning
394  *   the resource control dictionary entry matching a given resource control id.
395  *
396  * Return values
397  *   A pointer to the matching resource control dictionary entry, or NULL if the
398  *   id does not match any existing entries.
399  *
400  * Caller's context
401  *   Caller must not be holding rctl_lists_lock.
402  */
403 rctl_dict_entry_t *
404 rctl_dict_lookup_hndl(rctl_hndl_t hndl)
405 {
406 	uint_t i;
407 
408 	mutex_enter(&rctl_lists_lock);
409 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
410 		rctl_dict_entry_t *rde;
411 
412 		for (rde = rctl_lists[i];
413 		    rde != NULL;
414 		    rde = rde->rcd_next)
415 			if (rde->rcd_id == hndl) {
416 				mutex_exit(&rctl_lists_lock);
417 				return (rde);
418 			}
419 	}
420 	mutex_exit(&rctl_lists_lock);
421 
422 	return (NULL);
423 }
424 
425 /*
426  * void rctl_add_default_limit(const char *name, rctl_qty_t value,
427  *     rctl_priv_t privilege, uint_t action)
428  *
429  * Overview
430  *   Create a default limit with specified value, privilege, and action.
431  *
432  * Return value
433  *   No value returned.
434  */
435 void
436 rctl_add_default_limit(const char *name, rctl_qty_t value,
437     rctl_priv_t privilege, uint_t action)
438 {
439 	rctl_val_t *dval;
440 	rctl_dict_entry_t *rde;
441 
442 	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
443 	bzero(dval, sizeof (rctl_val_t));
444 	dval->rcv_value = value;
445 	dval->rcv_privilege = privilege;
446 	dval->rcv_flagaction = action;
447 	dval->rcv_action_recip_pid = -1;
448 
449 	rde = rctl_dict_lookup(name);
450 	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
451 }
452 
453 /*
454  * void rctl_add_legacy_limit(const char *name, const char *mname,
455  *     const char *lname, rctl_qty_t dflt)
456  *
457  * Overview
458  *   Create a default privileged limit, using the value obtained from
459  *   /etc/system if it exists and is greater than the specified default
460  *   value.  Exists primarily for System V IPC.
461  *
462  * Return value
463  *   No value returned.
464  */
465 void
466 rctl_add_legacy_limit(const char *name, const char *mname, const char *lname,
467     rctl_qty_t dflt, rctl_qty_t max)
468 {
469 	rctl_qty_t qty;
470 
471 	if (!mod_sysvar(mname, lname, &qty) || (qty < dflt))
472 		qty = dflt;
473 
474 	if (qty > max)
475 		qty = max;
476 
477 	rctl_add_default_limit(name, qty, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
478 }
479 
480 static rctl_set_t *
481 rctl_entity_obtain_rset(rctl_dict_entry_t *rcd, struct proc *p)
482 {
483 	rctl_set_t *rset = NULL;
484 
485 	if (rcd == NULL)
486 		return (NULL);
487 
488 	switch (rcd->rcd_entity) {
489 	case RCENTITY_PROCESS:
490 		rset = p->p_rctls;
491 		break;
492 	case RCENTITY_TASK:
493 		ASSERT(MUTEX_HELD(&p->p_lock));
494 		if (p->p_task != NULL)
495 			rset = p->p_task->tk_rctls;
496 		break;
497 	case RCENTITY_PROJECT:
498 		ASSERT(MUTEX_HELD(&p->p_lock));
499 		if (p->p_task != NULL &&
500 		    p->p_task->tk_proj != NULL)
501 			rset = p->p_task->tk_proj->kpj_rctls;
502 		break;
503 	case RCENTITY_ZONE:
504 		ASSERT(MUTEX_HELD(&p->p_lock));
505 		if (p->p_zone != NULL)
506 			rset = p->p_zone->zone_rctls;
507 		break;
508 	default:
509 		panic("unknown rctl entity type %d seen", rcd->rcd_entity);
510 		break;
511 	}
512 
513 	return (rset);
514 }
515 
516 static void
517 rctl_entity_obtain_entity_p(rctl_entity_t entity, struct proc *p,
518     rctl_entity_p_t *e)
519 {
520 	e->rcep_p.proc = NULL;
521 	e->rcep_t = entity;
522 
523 	switch (entity) {
524 	case RCENTITY_PROCESS:
525 		e->rcep_p.proc = p;
526 		break;
527 	case RCENTITY_TASK:
528 		ASSERT(MUTEX_HELD(&p->p_lock));
529 		if (p->p_task != NULL)
530 			e->rcep_p.task = p->p_task;
531 		break;
532 	case RCENTITY_PROJECT:
533 		ASSERT(MUTEX_HELD(&p->p_lock));
534 		if (p->p_task != NULL &&
535 		    p->p_task->tk_proj != NULL)
536 			e->rcep_p.proj = p->p_task->tk_proj;
537 		break;
538 	case RCENTITY_ZONE:
539 		ASSERT(MUTEX_HELD(&p->p_lock));
540 		if (p->p_zone != NULL)
541 			e->rcep_p.zone = p->p_zone;
542 		break;
543 	default:
544 		panic("unknown rctl entity type %d seen", entity);
545 		break;
546 	}
547 }
548 
549 static void
550 rctl_gp_alloc(rctl_alloc_gp_t *rcgp)
551 {
552 	uint_t i;
553 
554 	if (rcgp->rcag_nctls > 0) {
555 		rctl_t *prev = kmem_cache_alloc(rctl_cache, KM_SLEEP);
556 		rctl_t *rctl = prev;
557 
558 		rcgp->rcag_ctls = prev;
559 
560 		for (i = 1; i < rcgp->rcag_nctls; i++) {
561 			rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
562 			prev->rc_next = rctl;
563 			prev = rctl;
564 		}
565 
566 		rctl->rc_next = NULL;
567 	}
568 
569 	if (rcgp->rcag_nvals > 0) {
570 		rctl_val_t *prev = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
571 		rctl_val_t *rval = prev;
572 
573 		rcgp->rcag_vals = prev;
574 
575 		for (i = 1; i < rcgp->rcag_nvals; i++) {
576 			rval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
577 			prev->rcv_next = rval;
578 			prev = rval;
579 		}
580 
581 		rval->rcv_next = NULL;
582 	}
583 
584 }
585 
586 static rctl_val_t *
587 rctl_gp_detach_val(rctl_alloc_gp_t *rcgp)
588 {
589 	rctl_val_t *rval = rcgp->rcag_vals;
590 
591 	ASSERT(rcgp->rcag_nvals > 0);
592 	rcgp->rcag_nvals--;
593 	rcgp->rcag_vals = rval->rcv_next;
594 
595 	rval->rcv_next = NULL;
596 
597 	return (rval);
598 }
599 
600 static rctl_t *
601 rctl_gp_detach_ctl(rctl_alloc_gp_t *rcgp)
602 {
603 	rctl_t *rctl = rcgp->rcag_ctls;
604 
605 	ASSERT(rcgp->rcag_nctls > 0);
606 	rcgp->rcag_nctls--;
607 	rcgp->rcag_ctls = rctl->rc_next;
608 
609 	rctl->rc_next = NULL;
610 
611 	return (rctl);
612 
613 }
614 
615 static void
616 rctl_gp_free(rctl_alloc_gp_t *rcgp)
617 {
618 	rctl_val_t *rval = rcgp->rcag_vals;
619 	rctl_t *rctl = rcgp->rcag_ctls;
620 
621 	while (rval != NULL) {
622 		rctl_val_t *next = rval->rcv_next;
623 
624 		kmem_cache_free(rctl_val_cache, rval);
625 		rval = next;
626 	}
627 
628 	while (rctl != NULL) {
629 		rctl_t *next = rctl->rc_next;
630 
631 		kmem_cache_free(rctl_cache, rctl);
632 		rctl = next;
633 	}
634 }
635 
636 /*
637  * void rctl_prealloc_destroy(rctl_alloc_gp_t *)
638  *
639  * Overview
640  *   Release all unused memory allocated via one of the "prealloc" functions:
641  *   rctl_set_init_prealloc, rctl_set_dup_prealloc, or rctl_rlimit_set_prealloc.
642  *
643  * Return values
644  *   None.
645  *
646  * Caller's context
647  *   No restrictions on context.
648  */
649 void
650 rctl_prealloc_destroy(rctl_alloc_gp_t *gp)
651 {
652 	rctl_gp_free(gp);
653 	kmem_free(gp, sizeof (rctl_alloc_gp_t));
654 }
655 
656 /*
657  * int rctl_val_cmp(rctl_val_t *, rctl_val_t *, int)
658  *
659  * Overview
660  *   This function defines an ordering to rctl_val_t's in order to allow
661  *   for correct placement in value lists. When the imprecise flag is set,
662  *   the action recipient is ignored. This is to facilitate insert,
663  *   delete, and replace operations by rctlsys.
664  *
665  * Return values
666  *   0 if the val_t's are are considered identical
667  *   -1 if a is ordered lower than b
668  *   1 if a is lowered higher than b
669  *
670  * Caller's context
671  *   No restrictions on context.
672  */
673 int
674 rctl_val_cmp(rctl_val_t *a, rctl_val_t *b, int imprecise)
675 {
676 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) <
677 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
678 		return (-1);
679 
680 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) >
681 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
682 		return (1);
683 
684 	if (a->rcv_value < b->rcv_value)
685 		return (-1);
686 
687 	if (a->rcv_value > b->rcv_value)
688 		return (1);
689 
690 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) <
691 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
692 		return (-1);
693 
694 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) >
695 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
696 		return (1);
697 
698 	if (a->rcv_privilege < b->rcv_privilege)
699 		return (-1);
700 
701 	if (a->rcv_privilege > b->rcv_privilege)
702 		return (1);
703 
704 	if (imprecise)
705 		return (0);
706 
707 	if (a->rcv_action_recip_pid < b->rcv_action_recip_pid)
708 		return (-1);
709 
710 	if (a->rcv_action_recip_pid > b->rcv_action_recip_pid)
711 		return (1);
712 
713 	return (0);
714 }
715 
716 static rctl_val_t *
717 rctl_val_list_find(rctl_val_t **head, rctl_val_t *cval)
718 {
719 	rctl_val_t *rval = *head;
720 
721 	while (rval != NULL) {
722 		if (rctl_val_cmp(cval, rval, 0) == 0)
723 			return (rval);
724 
725 		rval = rval->rcv_next;
726 	}
727 
728 	return (NULL);
729 
730 }
731 
732 /*
733  * int rctl_val_list_insert(rctl_val_t **, rctl_val_t *)
734  *
735  * Overview
736  *   This function inserts the rctl_val_t into the value list provided.
737  *   The insert is always successful unless if the value is a duplicate
738  *   of one already in the list.
739  *
740  * Return values
741  *    1 if the value was a duplicate of an existing value in the list.
742  *    0 if the insert was successful.
743  */
744 int
745 rctl_val_list_insert(rctl_val_t **root, rctl_val_t *rval)
746 {
747 	rctl_val_t *prev;
748 	int equiv;
749 
750 	rval->rcv_next = NULL;
751 	rval->rcv_prev = NULL;
752 
753 	if (*root == NULL) {
754 		*root = rval;
755 		return (0);
756 	}
757 
758 	equiv = rctl_val_cmp(rval, *root, 0);
759 
760 	if (equiv == 0)
761 		return (1);
762 
763 	if (equiv < 0) {
764 		rval->rcv_next = *root;
765 		rval->rcv_next->rcv_prev = rval;
766 		*root = rval;
767 
768 		return (0);
769 	}
770 
771 	prev = *root;
772 	while (prev->rcv_next != NULL &&
773 	    (equiv = rctl_val_cmp(rval, prev->rcv_next, 0)) > 0) {
774 		prev = prev->rcv_next;
775 	}
776 
777 	if (equiv == 0)
778 		return (1);
779 
780 	rval->rcv_next = prev->rcv_next;
781 	if (rval->rcv_next != NULL)
782 		rval->rcv_next->rcv_prev = rval;
783 	prev->rcv_next = rval;
784 	rval->rcv_prev = prev;
785 
786 	return (0);
787 }
788 
789 static int
790 rctl_val_list_delete(rctl_val_t **root, rctl_val_t *rval)
791 {
792 	rctl_val_t *prev;
793 
794 	if (*root == NULL)
795 		return (-1);
796 
797 	prev = *root;
798 	if (rctl_val_cmp(rval, prev, 0) == 0) {
799 		*root = prev->rcv_next;
800 		(*root)->rcv_prev = NULL;
801 
802 		kmem_cache_free(rctl_val_cache, prev);
803 
804 		return (0);
805 	}
806 
807 	while (prev->rcv_next != NULL &&
808 	    rctl_val_cmp(rval, prev->rcv_next, 0) != 0) {
809 		prev = prev->rcv_next;
810 	}
811 
812 	if (prev->rcv_next == NULL) {
813 		/*
814 		 * If we navigate the entire list and cannot find a match, then
815 		 * return failure.
816 		 */
817 		return (-1);
818 	}
819 
820 	prev = prev->rcv_next;
821 	prev->rcv_prev->rcv_next = prev->rcv_next;
822 	if (prev->rcv_next != NULL)
823 		prev->rcv_next->rcv_prev = prev->rcv_prev;
824 
825 	kmem_cache_free(rctl_val_cache, prev);
826 
827 	return (0);
828 }
829 
830 static rctl_val_t *
831 rctl_val_list_dup(rctl_val_t *rval, rctl_alloc_gp_t *ragp, struct proc *oldp,
832     struct proc *newp)
833 {
834 	rctl_val_t *head = NULL;
835 
836 	for (; rval != NULL; rval = rval->rcv_next) {
837 		rctl_val_t *dval = rctl_gp_detach_val(ragp);
838 
839 		bcopy(rval, dval, sizeof (rctl_val_t));
840 		dval->rcv_prev = dval->rcv_next = NULL;
841 
842 		if (oldp == NULL ||
843 		    rval->rcv_action_recipient == NULL ||
844 		    rval->rcv_action_recipient == oldp) {
845 			if (rval->rcv_privilege == RCPRIV_BASIC) {
846 				dval->rcv_action_recipient = newp;
847 				dval->rcv_action_recip_pid = newp->p_pid;
848 			} else {
849 				dval->rcv_action_recipient = NULL;
850 				dval->rcv_action_recip_pid = -1;
851 			}
852 
853 			(void) rctl_val_list_insert(&head, dval);
854 		} else {
855 			kmem_cache_free(rctl_val_cache, dval);
856 		}
857 	}
858 
859 	return (head);
860 }
861 
862 static void
863 rctl_val_list_reset(rctl_val_t *rval)
864 {
865 	for (; rval != NULL; rval = rval->rcv_next)
866 		rval->rcv_firing_time = 0;
867 }
868 
869 static uint_t
870 rctl_val_list_count(rctl_val_t *rval)
871 {
872 	uint_t n = 0;
873 
874 	for (; rval != NULL; rval = rval->rcv_next)
875 		n++;
876 
877 	return (n);
878 }
879 
880 
881 static void
882 rctl_val_list_free(rctl_val_t *rval)
883 {
884 	while (rval != NULL) {
885 		rctl_val_t *next = rval->rcv_next;
886 
887 		kmem_cache_free(rctl_val_cache, rval);
888 
889 		rval = next;
890 	}
891 }
892 
893 /*
894  * rctl_qty_t rctl_model_maximum(rctl_dict_entry_t *, struct proc *)
895  *
896  * Overview
897  *   In cases where the operating system supports more than one process
898  *   addressing model, the operating system capabilities will exceed those of
899  *   one or more of these models.  Processes in a less capable model must have
900  *   their resources accurately controlled, without diluting those of their
901  *   descendants reached via exec().  rctl_model_maximum() returns the governing
902  *   value for the specified process with respect to a resource control, such
903  *   that the value can used for the RCTLOP_SET callback or compatability
904  *   support.
905  *
906  * Return values
907  *   The maximum value for the given process for the specified resource control.
908  *
909  * Caller's context
910  *   No restrictions on context.
911  */
912 rctl_qty_t
913 rctl_model_maximum(rctl_dict_entry_t *rde, struct proc *p)
914 {
915 	if (p->p_model == DATAMODEL_NATIVE)
916 		return (rde->rcd_max_native);
917 
918 	return (rde->rcd_max_ilp32);
919 }
920 
921 /*
922  * rctl_qty_t rctl_model_value(rctl_dict_entry_t *, struct proc *, rctl_qty_t)
923  *
924  * Overview
925  *   Convenience function wrapping the rctl_model_maximum() functionality.
926  *
927  * Return values
928  *   The lesser of the process's maximum value and the given value for the
929  *   specified resource control.
930  *
931  * Caller's context
932  *   No restrictions on context.
933  */
934 rctl_qty_t
935 rctl_model_value(rctl_dict_entry_t *rde, struct proc *p, rctl_qty_t value)
936 {
937 	rctl_qty_t max = rctl_model_maximum(rde, p);
938 
939 	return (value < max ? value : max);
940 }
941 
942 static void
943 rctl_set_insert(rctl_set_t *set, rctl_hndl_t hndl, rctl_t *rctl)
944 {
945 	uint_t index = hndl % rctl_set_size;
946 	rctl_t *next_ctl, *prev_ctl;
947 
948 	ASSERT(MUTEX_HELD(&set->rcs_lock));
949 
950 	rctl->rc_next = NULL;
951 
952 	if (set->rcs_ctls[index] == NULL) {
953 		set->rcs_ctls[index] = rctl;
954 		return;
955 	}
956 
957 	if (hndl < set->rcs_ctls[index]->rc_id) {
958 		rctl->rc_next = set->rcs_ctls[index];
959 		set->rcs_ctls[index] = rctl;
960 
961 		return;
962 	}
963 
964 	for (next_ctl = set->rcs_ctls[index]->rc_next,
965 	    prev_ctl = set->rcs_ctls[index];
966 	    next_ctl != NULL;
967 	    prev_ctl = next_ctl,
968 	    next_ctl = next_ctl->rc_next) {
969 		if (next_ctl->rc_id > hndl) {
970 			rctl->rc_next = next_ctl;
971 			prev_ctl->rc_next = rctl;
972 
973 			return;
974 		}
975 	}
976 
977 	rctl->rc_next = next_ctl;
978 	prev_ctl->rc_next = rctl;
979 }
980 
981 /*
982  * rctl_set_t *rctl_set_create()
983  *
984  * Overview
985  *   Create an empty resource control set, suitable for attaching to a
986  *   controlled entity.
987  *
988  * Return values
989  *   A pointer to the newly created set.
990  *
991  * Caller's context
992  *   Safe for KM_SLEEP allocations.
993  */
994 rctl_set_t *
995 rctl_set_create()
996 {
997 	rctl_set_t *rset = kmem_zalloc(sizeof (rctl_set_t), KM_SLEEP);
998 
999 	mutex_init(&rset->rcs_lock, NULL, MUTEX_DEFAULT, NULL);
1000 	rset->rcs_ctls = kmem_zalloc(rctl_set_size * sizeof (rctl_t *),
1001 	    KM_SLEEP);
1002 	rset->rcs_entity = -1;
1003 
1004 	return (rset);
1005 }
1006 
1007 /*
1008  * rctl_gp_alloc_t *rctl_set_init_prealloc(rctl_entity_t)
1009  *
1010  * Overview
1011  *    rctl_set_init_prealloc() examines the globally defined resource controls
1012  *    and their default values and returns a resource control allocation group
1013  *    populated with sufficient controls and values to form a representative
1014  *    resource control set for the specified entity.
1015  *
1016  * Return values
1017  *    A pointer to the newly created allocation group.
1018  *
1019  * Caller's context
1020  *    Caller must be in a context suitable for KM_SLEEP allocations.
1021  */
1022 rctl_alloc_gp_t *
1023 rctl_set_init_prealloc(rctl_entity_t entity)
1024 {
1025 	rctl_dict_entry_t *rde;
1026 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1027 
1028 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1029 
1030 	if (rctl_lists[entity] == NULL)
1031 		return (ragp);
1032 
1033 	mutex_enter(&rctl_lists_lock);
1034 
1035 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1036 		ragp->rcag_nctls++;
1037 		ragp->rcag_nvals += rctl_val_list_count(rde->rcd_default_value);
1038 	}
1039 
1040 	mutex_exit(&rctl_lists_lock);
1041 
1042 	rctl_gp_alloc(ragp);
1043 
1044 	return (ragp);
1045 }
1046 
1047 /*
1048  * rctl_set_t *rctl_set_init(rctl_entity_t)
1049  *
1050  * Overview
1051  *   rctl_set_create() creates a resource control set, initialized with the
1052  *   system infinite values on all registered controls, for attachment to a
1053  *   system entity requiring resource controls, such as a process or a task.
1054  *
1055  * Return values
1056  *   A pointer to the newly filled set.
1057  *
1058  * Caller's context
1059  *   Caller must be holding p_lock on entry so that RCTLOP_SET() functions
1060  *   may modify task and project members based on the proc structure
1061  *   they are passed.
1062  */
1063 rctl_set_t *
1064 rctl_set_init(rctl_entity_t entity, struct proc *p, rctl_entity_p_t *e,
1065     rctl_set_t *rset, rctl_alloc_gp_t *ragp)
1066 {
1067 	rctl_dict_entry_t *rde;
1068 
1069 	ASSERT(MUTEX_HELD(&p->p_lock));
1070 	ASSERT(e);
1071 	rset->rcs_entity = entity;
1072 
1073 	if (rctl_lists[entity] == NULL)
1074 		return (rset);
1075 
1076 	mutex_enter(&rctl_lists_lock);
1077 	mutex_enter(&rset->rcs_lock);
1078 
1079 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1080 		rctl_t *rctl = rctl_gp_detach_ctl(ragp);
1081 
1082 		rctl->rc_dict_entry = rde;
1083 		rctl->rc_id = rde->rcd_id;
1084 
1085 		rctl->rc_values = rctl_val_list_dup(rde->rcd_default_value,
1086 		    ragp, NULL, p);
1087 		rctl->rc_cursor = rctl->rc_values;
1088 
1089 		ASSERT(rctl->rc_cursor != NULL);
1090 
1091 		rctl_set_insert(rset, rde->rcd_id, rctl);
1092 
1093 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1094 		    rctl->rc_cursor->rcv_value));
1095 	}
1096 
1097 	mutex_exit(&rset->rcs_lock);
1098 	mutex_exit(&rctl_lists_lock);
1099 
1100 	return (rset);
1101 }
1102 
1103 static rctl_t *
1104 rctl_dup(rctl_t *rctl, rctl_alloc_gp_t *ragp, struct proc *oldp,
1105     struct proc *newp)
1106 {
1107 	rctl_t *dup = rctl_gp_detach_ctl(ragp);
1108 	rctl_val_t *dval;
1109 
1110 	dup->rc_id = rctl->rc_id;
1111 	dup->rc_dict_entry = rctl->rc_dict_entry;
1112 	dup->rc_next = NULL;
1113 	dup->rc_cursor = NULL;
1114 	dup->rc_values = rctl_val_list_dup(rctl->rc_values, ragp, oldp, newp);
1115 
1116 	for (dval = dup->rc_values;
1117 	    dval != NULL; dval = dval->rcv_next) {
1118 		if (rctl_val_cmp(rctl->rc_cursor, dval, 0) >= 0) {
1119 			dup->rc_cursor = dval;
1120 			break;
1121 		}
1122 	}
1123 
1124 	if (dup->rc_cursor == NULL)
1125 		dup->rc_cursor = dup->rc_values;
1126 
1127 	return (dup);
1128 }
1129 
1130 static void
1131 rctl_set_fill_alloc_gp(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1132 {
1133 	uint_t i;
1134 
1135 	bzero(ragp, sizeof (rctl_alloc_gp_t));
1136 
1137 	for (i = 0; i < rctl_set_size; i++) {
1138 		rctl_t *r = set->rcs_ctls[i];
1139 
1140 		while (r != NULL) {
1141 			ragp->rcag_nctls++;
1142 
1143 			ragp->rcag_nvals += rctl_val_list_count(r->rc_values);
1144 
1145 			r = r->rc_next;
1146 		}
1147 	}
1148 }
1149 
1150 /*
1151  * rctl_alloc_gp_t *rctl_set_dup_prealloc(rctl_set_t *)
1152  *
1153  * Overview
1154  *   Given a resource control set, allocate a sufficiently large allocation
1155  *   group to contain a duplicate of the set.
1156  *
1157  * Return value
1158  *   A pointer to the newly created allocation group.
1159  *
1160  * Caller's context
1161  *   Safe for KM_SLEEP allocations.
1162  */
1163 rctl_alloc_gp_t *
1164 rctl_set_dup_prealloc(rctl_set_t *set)
1165 {
1166 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1167 
1168 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1169 
1170 	mutex_enter(&set->rcs_lock);
1171 	rctl_set_fill_alloc_gp(set, ragp);
1172 	mutex_exit(&set->rcs_lock);
1173 
1174 	rctl_gp_alloc(ragp);
1175 
1176 	return (ragp);
1177 }
1178 
1179 /*
1180  * int rctl_set_dup_ready(rctl_set_t *, rctl_alloc_gp_t *)
1181  *
1182  * Overview
1183  *   Verify that the allocation group provided is large enough to allow a
1184  *   duplicate of the given resource control set to be constructed from its
1185  *   contents.
1186  *
1187  * Return values
1188  *   1 if the allocation group is sufficiently large, 0 otherwise.
1189  *
1190  * Caller's context
1191  *   rcs_lock must be held prior to entry.
1192  */
1193 int
1194 rctl_set_dup_ready(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1195 {
1196 	rctl_alloc_gp_t curr_gp;
1197 
1198 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1199 
1200 	rctl_set_fill_alloc_gp(set, &curr_gp);
1201 
1202 	if (curr_gp.rcag_nctls <= ragp->rcag_nctls &&
1203 	    curr_gp.rcag_nvals <= ragp->rcag_nvals)
1204 		return (1);
1205 
1206 	return (0);
1207 }
1208 
1209 /*
1210  * rctl_set_t *rctl_set_dup(rctl_set_t *, struct proc *, struct proc *,
1211  *   rctl_set_t *, rctl_alloc_gp_t *, int)
1212  *
1213  * Overview
1214  *   Make a duplicate of the resource control set.  The proc pointers are those
1215  *   of the owning process and of the process associated with the entity
1216  *   receiving the duplicate.
1217  *
1218  *   Duplication is a 3 stage process. Stage 1 is memory allocation for
1219  *   the duplicate set, which is taken care of by rctl_set_dup_prealloc().
1220  *   Stage 2 consists of copying all rctls and values from the old set into
1221  *   the new. Stage 3 completes the duplication by performing the appropriate
1222  *   callbacks for each rctl in the new set.
1223  *
1224  *   Stages 2 and 3 are handled by calling rctl_set_dup with the RCD_DUP and
1225  *   RCD_CALLBACK functions, respectively. The RCD_CALLBACK flag may only
1226  *   be supplied if the newp proc structure reflects the new task and
1227  *   project linkage.
1228  *
1229  * Return value
1230  *   A pointer to the duplicate set.
1231  *
1232  * Caller's context
1233  *   The rcs_lock of the set to be duplicated must be held prior to entry.
1234  */
1235 rctl_set_t *
1236 rctl_set_dup(rctl_set_t *set, struct proc *oldp, struct proc *newp,
1237     rctl_entity_p_t *e, rctl_set_t *dup, rctl_alloc_gp_t *ragp, int flag)
1238 {
1239 	uint_t i;
1240 	rctl_set_t	*iter;
1241 
1242 	ASSERT((flag & RCD_DUP) || (flag & RCD_CALLBACK));
1243 	ASSERT(e);
1244 	/*
1245 	 * When copying the old set, iterate over that. Otherwise, when
1246 	 * only callbacks have been requested, iterate over the dup set.
1247 	 */
1248 	if (flag & RCD_DUP) {
1249 		ASSERT(MUTEX_HELD(&set->rcs_lock));
1250 		iter = set;
1251 		dup->rcs_entity = set->rcs_entity;
1252 	} else {
1253 		iter = dup;
1254 	}
1255 
1256 	mutex_enter(&dup->rcs_lock);
1257 
1258 	for (i = 0; i < rctl_set_size; i++) {
1259 		rctl_t *r = iter->rcs_ctls[i];
1260 		rctl_t *d;
1261 
1262 		while (r != NULL) {
1263 			if (flag & RCD_DUP) {
1264 				d = rctl_dup(r, ragp, oldp, newp);
1265 				rctl_set_insert(dup, r->rc_id, d);
1266 			} else {
1267 				d = r;
1268 			}
1269 
1270 			if (flag & RCD_CALLBACK)
1271 				RCTLOP_SET(d, newp, e,
1272 				    rctl_model_value(d->rc_dict_entry, newp,
1273 				    d->rc_cursor->rcv_value));
1274 
1275 			r = r->rc_next;
1276 		}
1277 	}
1278 
1279 	mutex_exit(&dup->rcs_lock);
1280 
1281 	return (dup);
1282 }
1283 
1284 /*
1285  * void rctl_set_free(rctl_set_t *)
1286  *
1287  * Overview
1288  *   Delete resource control set and all attached values.
1289  *
1290  * Return values
1291  *   No value returned.
1292  *
1293  * Caller's context
1294  *   No restrictions on context.
1295  */
1296 void
1297 rctl_set_free(rctl_set_t *set)
1298 {
1299 	uint_t i;
1300 
1301 	mutex_enter(&set->rcs_lock);
1302 	for (i = 0; i < rctl_set_size; i++) {
1303 		rctl_t *r = set->rcs_ctls[i];
1304 
1305 		while (r != NULL) {
1306 			rctl_val_t *v = r->rc_values;
1307 			rctl_t *n = r->rc_next;
1308 
1309 			kmem_cache_free(rctl_cache, r);
1310 
1311 			rctl_val_list_free(v);
1312 
1313 			r = n;
1314 		}
1315 	}
1316 	mutex_exit(&set->rcs_lock);
1317 
1318 	kmem_free(set->rcs_ctls, sizeof (rctl_t *) * rctl_set_size);
1319 	kmem_free(set, sizeof (rctl_set_t));
1320 }
1321 
1322 /*
1323  * void rctl_set_reset(rctl_set_t *)
1324  *
1325  * Overview
1326  *   Resets all rctls within the set such that the lowest value becomes active.
1327  *
1328  * Return values
1329  *   No value returned.
1330  *
1331  * Caller's context
1332  *   No restrictions on context.
1333  */
1334 void
1335 rctl_set_reset(rctl_set_t *set, struct proc *p, rctl_entity_p_t *e)
1336 {
1337 	uint_t i;
1338 
1339 	ASSERT(e);
1340 
1341 	mutex_enter(&set->rcs_lock);
1342 	for (i = 0; i < rctl_set_size; i++) {
1343 		rctl_t *r = set->rcs_ctls[i];
1344 
1345 		while (r != NULL) {
1346 			r->rc_cursor = r->rc_values;
1347 			rctl_val_list_reset(r->rc_cursor);
1348 			RCTLOP_SET(r, p, e, rctl_model_value(r->rc_dict_entry,
1349 			    p, r->rc_cursor->rcv_value));
1350 
1351 			ASSERT(r->rc_cursor != NULL);
1352 
1353 			r = r->rc_next;
1354 		}
1355 	}
1356 
1357 	mutex_exit(&set->rcs_lock);
1358 }
1359 
1360 /*
1361  * void rctl_set_tearoff(rctl_set *, struct proc *)
1362  *
1363  * Overview
1364  *   Tear off any resource control values on this set with an action recipient
1365  *   equal to the specified process (as they are becoming invalid with the
1366  *   process's departure from this set as an observer).
1367  *
1368  * Return values
1369  *   No value returned.
1370  *
1371  * Caller's context
1372  *   No restrictions on context
1373  */
1374 void
1375 rctl_set_tearoff(rctl_set_t *set, struct proc *p)
1376 {
1377 	uint_t i;
1378 
1379 	mutex_enter(&set->rcs_lock);
1380 	for (i = 0; i < rctl_set_size; i++) {
1381 		rctl_t *r = set->rcs_ctls[i];
1382 
1383 		while (r != NULL) {
1384 			rctl_val_t *rval;
1385 
1386 tearoff_rewalk_list:
1387 			rval = r->rc_values;
1388 
1389 			while (rval != NULL) {
1390 				if (rval->rcv_privilege == RCPRIV_BASIC &&
1391 				    rval->rcv_action_recipient == p) {
1392 					if (r->rc_cursor == rval)
1393 						r->rc_cursor = rval->rcv_next;
1394 
1395 					(void) rctl_val_list_delete(
1396 					    &r->rc_values, rval);
1397 
1398 					goto tearoff_rewalk_list;
1399 				}
1400 
1401 				rval = rval->rcv_next;
1402 			}
1403 
1404 			ASSERT(r->rc_cursor != NULL);
1405 
1406 			r = r->rc_next;
1407 		}
1408 	}
1409 
1410 	mutex_exit(&set->rcs_lock);
1411 }
1412 
1413 static int
1414 rctl_set_find(rctl_set_t *set, rctl_hndl_t hndl, rctl_t **rctl)
1415 {
1416 	uint_t index = hndl % rctl_set_size;
1417 	rctl_t *curr_ctl;
1418 
1419 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1420 
1421 	for (curr_ctl = set->rcs_ctls[index]; curr_ctl != NULL;
1422 	    curr_ctl = curr_ctl->rc_next) {
1423 		if (curr_ctl->rc_id == hndl) {
1424 			*rctl = curr_ctl;
1425 
1426 			return (0);
1427 		}
1428 	}
1429 
1430 	return (-1);
1431 }
1432 
1433 /*
1434  * rlim64_t rctl_enforced_value(rctl_hndl_t, rctl_set_t *, struct proc *)
1435  *
1436  * Overview
1437  *   Given a process, get the next enforced value on the rctl of the specified
1438  *   handle.
1439  *
1440  * Return value
1441  *   The enforced value.
1442  *
1443  * Caller's context
1444  *   For controls on process collectives, p->p_lock must be held across the
1445  *   operation.
1446  */
1447 /*ARGSUSED*/
1448 rctl_qty_t
1449 rctl_enforced_value(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p)
1450 {
1451 	rctl_t *rctl;
1452 	rlim64_t ret;
1453 
1454 	mutex_enter(&rset->rcs_lock);
1455 
1456 	if (rctl_set_find(rset, hndl, &rctl) == -1)
1457 		panic("unknown resource control handle %d requested", hndl);
1458 	else
1459 		ret = rctl_model_value(rctl->rc_dict_entry, p,
1460 		    rctl->rc_cursor->rcv_value);
1461 
1462 	mutex_exit(&rset->rcs_lock);
1463 
1464 	return (ret);
1465 }
1466 
1467 /*
1468  * int rctl_global_get(const char *, rctl_dict_entry_t *)
1469  *
1470  * Overview
1471  *   Copy a sanitized version of the global rctl for a given resource control
1472  *   name.  (By sanitization, we mean that the unsafe data pointers have been
1473  *   zeroed.)
1474  *
1475  * Return value
1476  *   -1 if name not defined, 0 otherwise.
1477  *
1478  * Caller's context
1479  *   No restrictions on context.  rctl_dict_lock must not be held.
1480  */
1481 int
1482 rctl_global_get(const char *name, rctl_dict_entry_t *drde)
1483 {
1484 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1485 
1486 	if (rde == NULL)
1487 		return (-1);
1488 
1489 	bcopy(rde, drde, sizeof (rctl_dict_entry_t));
1490 
1491 	drde->rcd_next = NULL;
1492 	drde->rcd_ops = NULL;
1493 
1494 	return (0);
1495 }
1496 
1497 /*
1498  * int rctl_global_set(const char *, rctl_dict_entry_t *)
1499  *
1500  * Overview
1501  *   Transfer the settable fields of the named rctl to the global rctl matching
1502  *   the given resource control name.
1503  *
1504  * Return value
1505  *   -1 if name not defined, 0 otherwise.
1506  *
1507  * Caller's context
1508  *   No restrictions on context.  rctl_dict_lock must not be held.
1509  */
1510 int
1511 rctl_global_set(const char *name, rctl_dict_entry_t *drde)
1512 {
1513 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1514 
1515 	if (rde == NULL)
1516 		return (-1);
1517 
1518 	rde->rcd_flagaction = drde->rcd_flagaction;
1519 	rde->rcd_syslog_level = drde->rcd_syslog_level;
1520 	rde->rcd_strlog_flags = drde->rcd_strlog_flags;
1521 
1522 	return (0);
1523 }
1524 
1525 static int
1526 rctl_local_op(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1527     int (*cbop)(rctl_hndl_t, struct proc *p, rctl_entity_p_t *e, rctl_t *,
1528     rctl_val_t *, rctl_val_t *), struct proc *p)
1529 {
1530 	rctl_t *rctl;
1531 	rctl_set_t *rset;
1532 	rctl_entity_p_t e;
1533 	int ret = 0;
1534 	rctl_dict_entry_t *rde = rctl_dict_lookup_hndl(hndl);
1535 
1536 local_op_retry:
1537 
1538 	ASSERT(MUTEX_HELD(&p->p_lock));
1539 
1540 	rset = rctl_entity_obtain_rset(rde, p);
1541 
1542 	if (rset == NULL) {
1543 		return (-1);
1544 	}
1545 	rctl_entity_obtain_entity_p(rset->rcs_entity, p, &e);
1546 
1547 	mutex_enter(&rset->rcs_lock);
1548 
1549 	/* using rctl's hndl, get rctl from local set */
1550 	if (rctl_set_find(rset, hndl, &rctl) == -1) {
1551 		mutex_exit(&rset->rcs_lock);
1552 		return (-1);
1553 	}
1554 
1555 	ret = cbop(hndl, p, &e, rctl, oval, nval);
1556 
1557 	mutex_exit(&rset->rcs_lock);
1558 	return (ret);
1559 }
1560 
1561 /*ARGSUSED*/
1562 static int
1563 rctl_local_get_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1564     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1565 {
1566 	if (oval == NULL) {
1567 		/*
1568 		 * RCTL_FIRST
1569 		 */
1570 		bcopy(rctl->rc_values, nval, sizeof (rctl_val_t));
1571 	} else {
1572 		/*
1573 		 * RCTL_NEXT
1574 		 */
1575 		rctl_val_t *tval = rctl_val_list_find(&rctl->rc_values, oval);
1576 
1577 		if (tval == NULL)
1578 			return (ESRCH);
1579 		else if (tval->rcv_next == NULL)
1580 			return (ENOENT);
1581 		else
1582 			bcopy(tval->rcv_next, nval, sizeof (rctl_val_t));
1583 	}
1584 
1585 	return (0);
1586 }
1587 
1588 /*
1589  * int rctl_local_get(rctl_hndl_t, rctl_val_t *)
1590  *
1591  * Overview
1592  *   Get the rctl value for the given flags.
1593  *
1594  * Return values
1595  *   0 for successful get, errno otherwise.
1596  */
1597 int
1598 rctl_local_get(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1599     struct proc *p)
1600 {
1601 	return (rctl_local_op(hndl, oval, nval, rctl_local_get_cb, p));
1602 }
1603 
1604 /*ARGSUSED*/
1605 static int
1606 rctl_local_delete_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1607     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1608 {
1609 	if ((oval = rctl_val_list_find(&rctl->rc_values, nval)) == NULL)
1610 		return (ESRCH);
1611 
1612 	if (rctl->rc_cursor == oval) {
1613 		rctl->rc_cursor = oval->rcv_next;
1614 		rctl_val_list_reset(rctl->rc_cursor);
1615 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1616 		    rctl->rc_cursor->rcv_value));
1617 
1618 		ASSERT(rctl->rc_cursor != NULL);
1619 	}
1620 
1621 	(void) rctl_val_list_delete(&rctl->rc_values, oval);
1622 
1623 	return (0);
1624 }
1625 
1626 /*
1627  * int rctl_local_delete(rctl_hndl_t, rctl_val_t *)
1628  *
1629  * Overview
1630  *   Delete the rctl value for the given flags.
1631  *
1632  * Return values
1633  *   0 for successful delete, errno otherwise.
1634  */
1635 int
1636 rctl_local_delete(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1637 {
1638 	return (rctl_local_op(hndl, NULL, val, rctl_local_delete_cb, p));
1639 }
1640 
1641 /*
1642  * rctl_local_insert_cb()
1643  *
1644  * Overview
1645  *   Insert a new value into the rctl's val list. If an error occurs,
1646  *   the val list must be left in the same state as when the function
1647  *   was entered.
1648  *
1649  * Return Values
1650  *   0 for successful insert, EINVAL if the value is duplicated in the
1651  *   existing list.
1652  */
1653 /*ARGSUSED*/
1654 static int
1655 rctl_local_insert_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1656     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1657 {
1658 	/*
1659 	 * Before inserting, confirm there are no duplicates of this value
1660 	 * and flag level. If there is a duplicate, flag an error and do
1661 	 * nothing.
1662 	 */
1663 	if (rctl_val_list_insert(&rctl->rc_values, nval) != 0)
1664 		return (EINVAL);
1665 
1666 	if (rctl_val_cmp(nval, rctl->rc_cursor, 0) < 0) {
1667 		rctl->rc_cursor = nval;
1668 		rctl_val_list_reset(rctl->rc_cursor);
1669 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1670 		    rctl->rc_cursor->rcv_value));
1671 
1672 		ASSERT(rctl->rc_cursor != NULL);
1673 	}
1674 
1675 	return (0);
1676 }
1677 
1678 /*
1679  * int rctl_local_insert(rctl_hndl_t, rctl_val_t *)
1680  *
1681  * Overview
1682  *   Insert the rctl value into the appropriate rctl set for the calling
1683  *   process, given the handle.
1684  */
1685 int
1686 rctl_local_insert(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1687 {
1688 	return (rctl_local_op(hndl, NULL, val, rctl_local_insert_cb, p));
1689 }
1690 
1691 static int
1692 rctl_local_replace_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1693     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1694 {
1695 	int ret;
1696 	rctl_val_t *tmp;
1697 
1698 	/* Verify that old will be delete-able */
1699 	tmp = rctl_val_list_find(&rctl->rc_values, oval);
1700 	if (tmp == NULL)
1701 		return (ESRCH);
1702 	/*
1703 	 * Caller should verify that value being deleted is not the
1704 	 * system value.
1705 	 */
1706 	ASSERT(tmp->rcv_privilege != RCPRIV_SYSTEM);
1707 
1708 	/*
1709 	 * rctl_local_insert_cb() does the job of flagging an error
1710 	 * for any duplicate values. So, call rctl_local_insert_cb()
1711 	 * for the new value first, then do deletion of the old value.
1712 	 * Since this is a callback function to rctl_local_op, we can
1713 	 * count on rcs_lock being held at this point. This guarantees
1714 	 * that there is at no point a visible list which contains both
1715 	 * new and old values.
1716 	 */
1717 	if (ret = rctl_local_insert_cb(hndl, p, e, rctl, NULL, nval))
1718 		return (ret);
1719 
1720 	ret = rctl_local_delete_cb(hndl, p, e, rctl, NULL, oval);
1721 	ASSERT(ret == 0);
1722 	return (0);
1723 }
1724 
1725 /*
1726  * int rctl_local_replace(rctl_hndl_t, void *, int, uint64_t *)
1727  *
1728  * Overview
1729  *   Replace the rctl value with a new one.
1730  *
1731  * Return values
1732  *   0 for successful replace, errno otherwise.
1733  */
1734 int
1735 rctl_local_replace(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1736     struct proc *p)
1737 {
1738 	return (rctl_local_op(hndl, oval, nval, rctl_local_replace_cb, p));
1739 }
1740 
1741 /*
1742  * int rctl_rlimit_get(rctl_hndl_t, struct proc *, struct rlimit64 *)
1743  *
1744  * Overview
1745  *   To support rlimit compatibility, we need a function which takes a 64-bit
1746  *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
1747  *   This operation is only intended for legacy rlimits.
1748  */
1749 int
1750 rctl_rlimit_get(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64)
1751 {
1752 	rctl_t *rctl;
1753 	rctl_val_t *rval;
1754 	rctl_set_t *rset = p->p_rctls;
1755 	int soft_limit_seen = 0;
1756 	int test_for_deny = 1;
1757 
1758 	mutex_enter(&rset->rcs_lock);
1759 	if (rctl_set_find(rset, rc, &rctl) == -1) {
1760 		mutex_exit(&rset->rcs_lock);
1761 		return (-1);
1762 	}
1763 
1764 	rval = rctl->rc_values;
1765 
1766 	if (rctl->rc_dict_entry->rcd_flagaction & (RCTL_GLOBAL_DENY_NEVER |
1767 	    RCTL_GLOBAL_DENY_ALWAYS))
1768 		test_for_deny = 0;
1769 
1770 	/*
1771 	 * 1.  Find the first control value with the RCTL_LOCAL_DENY bit set.
1772 	 */
1773 	while (rval != NULL && rval->rcv_privilege != RCPRIV_SYSTEM) {
1774 		if (test_for_deny &&
1775 		    (rval->rcv_flagaction & RCTL_LOCAL_DENY) == 0) {
1776 			rval = rval->rcv_next;
1777 			continue;
1778 		}
1779 
1780 		/*
1781 		 * 2.  If this is an RCPRIV_BASIC value, then we've found the
1782 		 * effective soft limit and should set rlim_cur.  We should then
1783 		 * continue looking for another control value with the DENY bit
1784 		 * set.
1785 		 */
1786 		if (rval->rcv_privilege == RCPRIV_BASIC) {
1787 			if (soft_limit_seen) {
1788 				rval = rval->rcv_next;
1789 				continue;
1790 			}
1791 
1792 			if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
1793 			    rval->rcv_value < rctl_model_maximum(
1794 			    rctl->rc_dict_entry, p))
1795 				rlp64->rlim_cur = rval->rcv_value;
1796 			else
1797 				rlp64->rlim_cur = RLIM64_INFINITY;
1798 			soft_limit_seen = 1;
1799 
1800 			rval = rval->rcv_next;
1801 			continue;
1802 		}
1803 
1804 		/*
1805 		 * 3.  This is an RCPRIV_PRIVILEGED value.  If we haven't found
1806 		 * a soft limit candidate, then we've found the effective hard
1807 		 * and soft limits and should set both  If we had found a soft
1808 		 * limit, then this is only the hard limit and we need only set
1809 		 * rlim_max.
1810 		 */
1811 		if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
1812 		    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry,
1813 		    p))
1814 			rlp64->rlim_max = rval->rcv_value;
1815 		else
1816 			rlp64->rlim_max = RLIM64_INFINITY;
1817 		if (!soft_limit_seen)
1818 			rlp64->rlim_cur = rlp64->rlim_max;
1819 
1820 		mutex_exit(&rset->rcs_lock);
1821 		return (0);
1822 	}
1823 
1824 	if (rval == NULL) {
1825 		/*
1826 		 * This control sequence is corrupt, as it is not terminated by
1827 		 * a system privileged control value.
1828 		 */
1829 		mutex_exit(&rset->rcs_lock);
1830 		return (-1);
1831 	}
1832 
1833 	/*
1834 	 * 4.  If we run into a RCPRIV_SYSTEM value, then the hard limit (and
1835 	 * the soft, if we haven't a soft candidate) should be the value of the
1836 	 * system control value.
1837 	 */
1838 	if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
1839 	    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry, p))
1840 		rlp64->rlim_max = rval->rcv_value;
1841 	else
1842 		rlp64->rlim_max = RLIM64_INFINITY;
1843 
1844 	if (!soft_limit_seen)
1845 		rlp64->rlim_cur = rlp64->rlim_max;
1846 
1847 	mutex_exit(&rset->rcs_lock);
1848 	return (0);
1849 }
1850 
1851 /*
1852  * rctl_alloc_gp_t *rctl_rlimit_set_prealloc(uint_t)
1853  *
1854  * Overview
1855  *   Before making a series of calls to rctl_rlimit_set(), we must have a
1856  *   preallocated batch of resource control values, as rctl_rlimit_set() can
1857  *   potentially consume two resource control values per call.
1858  *
1859  * Return values
1860  *   A populated resource control allocation group with 2n resource control
1861  *   values.
1862  *
1863  * Caller's context
1864  *   Must be safe for KM_SLEEP allocations.
1865  */
1866 rctl_alloc_gp_t *
1867 rctl_rlimit_set_prealloc(uint_t n)
1868 {
1869 	rctl_alloc_gp_t *gp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1870 
1871 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1872 
1873 	gp->rcag_nvals = 2 * n;
1874 
1875 	rctl_gp_alloc(gp);
1876 
1877 	return (gp);
1878 }
1879 
1880 /*
1881  * int rctl_rlimit_set(rctl_hndl_t, struct proc *, struct rlimit64 *, int,
1882  *   int)
1883  *
1884  * Overview
1885  *   To support rlimit compatibility, we need a function which takes a 64-bit
1886  *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
1887  *   This operation is only intended for legacy rlimits.
1888  *
1889  *   The implementation of rctl_rlimit_set() is a bit clever, as it tries to
1890  *   minimize the number of values placed on the value sequence in various
1891  *   cases.  Furthermore, we don't allow multiple identical privilege-action
1892  *   values on the same sequence.  (That is, we don't want a sequence like
1893  *   "while (1) { rlim.rlim_cur++; setrlimit(..., rlim); }" to exhaust kernel
1894  *   memory.)  So we want to delete any values with the same privilege value and
1895  *   action.
1896  *
1897  * Return values
1898  *   0 for successful set, errno otherwise. Errno will be either EINVAL
1899  *   or EPERM, in keeping with defined errnos for ulimit() and setrlimit()
1900  *   system calls.
1901  */
1902 /*ARGSUSED*/
1903 int
1904 rctl_rlimit_set(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64,
1905     rctl_alloc_gp_t *ragp, int flagaction, int signal, const cred_t *cr)
1906 {
1907 	rctl_t *rctl;
1908 	rctl_val_t *rval, *rval_priv, *rval_basic;
1909 	rctl_set_t *rset = p->p_rctls;
1910 	rctl_qty_t max;
1911 	rctl_entity_p_t e;
1912 	struct rlimit64 cur_rl;
1913 
1914 	e.rcep_t = RCENTITY_PROCESS;
1915 	e.rcep_p.proc = p;
1916 
1917 	if (rlp64->rlim_cur > rlp64->rlim_max)
1918 		return (EINVAL);
1919 
1920 	if (rctl_rlimit_get(rc, p, &cur_rl) == -1)
1921 		return (EINVAL);
1922 
1923 	/*
1924 	 * If we are not privileged, we can only lower the hard limit.
1925 	 */
1926 	if ((rlp64->rlim_max > cur_rl.rlim_max) &&
1927 	    cur_rl.rlim_max != RLIM64_INFINITY &&
1928 	    secpolicy_resource(cr) != 0)
1929 		return (EPERM);
1930 
1931 	mutex_enter(&rset->rcs_lock);
1932 
1933 	if (rctl_set_find(rset, rc, &rctl) == -1) {
1934 		mutex_exit(&rset->rcs_lock);
1935 		return (EINVAL);
1936 	}
1937 
1938 	rval_priv = rctl_gp_detach_val(ragp);
1939 
1940 	rval = rctl->rc_values;
1941 
1942 	while (rval != NULL) {
1943 		rctl_val_t *next = rval->rcv_next;
1944 
1945 		if (rval->rcv_privilege == RCPRIV_SYSTEM)
1946 			break;
1947 
1948 		if ((rval->rcv_privilege == RCPRIV_BASIC) ||
1949 		    (rval->rcv_flagaction & ~RCTL_LOCAL_ACTION_MASK) ==
1950 		    (flagaction & ~RCTL_LOCAL_ACTION_MASK)) {
1951 			if (rctl->rc_cursor == rval) {
1952 				rctl->rc_cursor = rval->rcv_next;
1953 				rctl_val_list_reset(rctl->rc_cursor);
1954 				RCTLOP_SET(rctl, p, &e, rctl_model_value(
1955 				    rctl->rc_dict_entry, p,
1956 				    rctl->rc_cursor->rcv_value));
1957 			}
1958 			(void) rctl_val_list_delete(&rctl->rc_values, rval);
1959 		}
1960 
1961 		rval = next;
1962 	}
1963 
1964 	rval_priv->rcv_privilege = RCPRIV_PRIVILEGED;
1965 	rval_priv->rcv_flagaction = flagaction;
1966 	if (rlp64->rlim_max == RLIM64_INFINITY) {
1967 		rval_priv->rcv_flagaction |= RCTL_LOCAL_MAXIMAL;
1968 		max = rctl->rc_dict_entry->rcd_max_native;
1969 	} else {
1970 		max = rlp64->rlim_max;
1971 	}
1972 	rval_priv->rcv_value = max;
1973 	rval_priv->rcv_action_signal = signal;
1974 	rval_priv->rcv_action_recipient = NULL;
1975 	rval_priv->rcv_action_recip_pid = -1;
1976 	rval_priv->rcv_firing_time = 0;
1977 	rval_priv->rcv_prev = rval_priv->rcv_next = NULL;
1978 
1979 	(void) rctl_val_list_insert(&rctl->rc_values, rval_priv);
1980 	rctl->rc_cursor = rval_priv;
1981 	rctl_val_list_reset(rctl->rc_cursor);
1982 	RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
1983 	    rctl->rc_cursor->rcv_value));
1984 
1985 	if (rlp64->rlim_cur != RLIM64_INFINITY && rlp64->rlim_cur < max) {
1986 		rval_basic = rctl_gp_detach_val(ragp);
1987 
1988 		rval_basic->rcv_privilege = RCPRIV_BASIC;
1989 		rval_basic->rcv_value = rlp64->rlim_cur;
1990 		rval_basic->rcv_flagaction = flagaction;
1991 		rval_basic->rcv_action_signal = signal;
1992 		rval_basic->rcv_action_recipient = p;
1993 		rval_basic->rcv_action_recip_pid = p->p_pid;
1994 		rval_basic->rcv_firing_time = 0;
1995 		rval_basic->rcv_prev = rval_basic->rcv_next = NULL;
1996 
1997 		(void) rctl_val_list_insert(&rctl->rc_values, rval_basic);
1998 		rctl->rc_cursor = rval_basic;
1999 		rctl_val_list_reset(rctl->rc_cursor);
2000 		RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
2001 		    rctl->rc_cursor->rcv_value));
2002 	}
2003 
2004 	ASSERT(rctl->rc_cursor != NULL);
2005 
2006 	mutex_exit(&rset->rcs_lock);
2007 	return (0);
2008 }
2009 
2010 
2011 /*
2012  * rctl_hndl_t rctl_register(const char *, rctl_entity_t, int, rlim64_t,
2013  *   rlim64_t, rctl_ops_t *)
2014  *
2015  * Overview
2016  *   rctl_register() performs a look-up in the dictionary of rctls
2017  *   active on the system; if a rctl of that name is absent, an entry is
2018  *   made into the dictionary.  The rctl is returned with its reference
2019  *   count incremented by one.  If the rctl name already exists, we panic.
2020  *   (Were the resource control system to support dynamic loading and unloading,
2021  *   which it is structured for, duplicate registration should lead to load
2022  *   failure instead of panicking.)
2023  *
2024  *   Each registered rctl has a requirement that a RCPRIV_SYSTEM limit be
2025  *   defined.  This limit contains the highest possible value for this quantity
2026  *   on the system.  Furthermore, the registered control must provide infinite
2027  *   values for all applicable address space models supported by the operating
2028  *   system.  Attempts to set resource control values beyond the system limit
2029  *   will fail.
2030  *
2031  * Return values
2032  *   The rctl's ID.
2033  *
2034  * Caller's context
2035  *   Caller must be in a context suitable for KM_SLEEP allocations.
2036  */
2037 rctl_hndl_t
2038 rctl_register(
2039     const char *name,
2040     rctl_entity_t entity,
2041     int global_flags,
2042     rlim64_t max_native,
2043     rlim64_t max_ilp32,
2044     rctl_ops_t *ops)
2045 {
2046 	rctl_t *rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
2047 	rctl_val_t *rctl_val = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2048 	rctl_dict_entry_t *rctl_de = kmem_zalloc(sizeof (rctl_dict_entry_t),
2049 	    KM_SLEEP);
2050 	rctl_t *old_rctl;
2051 	rctl_hndl_t rhndl;
2052 	int localflags;
2053 
2054 	ASSERT(ops != NULL);
2055 
2056 	bzero(rctl, sizeof (rctl_t));
2057 	bzero(rctl_val, sizeof (rctl_val_t));
2058 
2059 	if (global_flags & RCTL_GLOBAL_DENY_NEVER)
2060 		localflags = RCTL_LOCAL_MAXIMAL;
2061 	else
2062 		localflags = RCTL_LOCAL_MAXIMAL | RCTL_LOCAL_DENY;
2063 
2064 	rctl_val->rcv_privilege = RCPRIV_SYSTEM;
2065 	rctl_val->rcv_value = max_native;
2066 	rctl_val->rcv_flagaction = localflags;
2067 	rctl_val->rcv_action_signal = 0;
2068 	rctl_val->rcv_action_recipient = NULL;
2069 	rctl_val->rcv_action_recip_pid = -1;
2070 	rctl_val->rcv_firing_time = 0;
2071 	rctl_val->rcv_next = NULL;
2072 	rctl_val->rcv_prev = NULL;
2073 
2074 	rctl_de->rcd_name = (char *)name;
2075 	rctl_de->rcd_default_value = rctl_val;
2076 	rctl_de->rcd_max_native = max_native;
2077 	rctl_de->rcd_max_ilp32 = max_ilp32;
2078 	rctl_de->rcd_entity = entity;
2079 	rctl_de->rcd_ops = ops;
2080 	rctl_de->rcd_flagaction = global_flags;
2081 
2082 	rctl->rc_dict_entry = rctl_de;
2083 	rctl->rc_values = rctl_val;
2084 
2085 	/*
2086 	 * 1.  Take global lock, validate nonexistence of name, get ID.
2087 	 */
2088 	mutex_enter(&rctl_dict_lock);
2089 
2090 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
2091 	    (mod_hash_val_t *)&rhndl) != MH_ERR_NOTFOUND)
2092 		panic("duplicate registration of rctl %s", name);
2093 
2094 	rhndl = rctl_de->rcd_id = rctl->rc_id =
2095 	    (rctl_hndl_t)id_alloc(rctl_ids);
2096 
2097 	/*
2098 	 * 2.  Insert name-entry pair in rctl_dict_by_name.
2099 	 */
2100 	if (mod_hash_insert(rctl_dict_by_name, (mod_hash_key_t)name,
2101 	    (mod_hash_val_t)rctl_de))
2102 		panic("unable to insert rctl dict entry for %s (%u)", name,
2103 		    (uint_t)rctl->rc_id);
2104 
2105 	/*
2106 	 * 3.  Insert ID-rctl_t * pair in rctl_dict.
2107 	 */
2108 	if (mod_hash_find(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2109 	    (mod_hash_val_t *)&old_rctl) != MH_ERR_NOTFOUND)
2110 		panic("duplicate rctl ID %u registered", rctl->rc_id);
2111 
2112 	if (mod_hash_insert(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2113 	    (mod_hash_val_t)rctl))
2114 		panic("unable to insert rctl %s/%u (%p)", name,
2115 		    (uint_t)rctl->rc_id, rctl);
2116 
2117 	/*
2118 	 * 3a. Insert rctl_dict_entry_t * in appropriate entity list.
2119 	 */
2120 
2121 	mutex_enter(&rctl_lists_lock);
2122 
2123 	switch (entity) {
2124 	case RCENTITY_ZONE:
2125 	case RCENTITY_PROJECT:
2126 	case RCENTITY_TASK:
2127 	case RCENTITY_PROCESS:
2128 		rctl_de->rcd_next = rctl_lists[entity];
2129 		rctl_lists[entity] = rctl_de;
2130 		break;
2131 	default:
2132 		panic("registering unknown rctl entity %d (%s)", entity,
2133 		    name);
2134 		break;
2135 	}
2136 
2137 	mutex_exit(&rctl_lists_lock);
2138 
2139 	/*
2140 	 * 4.  Drop lock.
2141 	 */
2142 	mutex_exit(&rctl_dict_lock);
2143 
2144 	return (rhndl);
2145 }
2146 
2147 /*
2148  * static int rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p,
2149  *    rctl_val_t *v)
2150  *
2151  * Overview
2152  *   rctl_global_action() takes, in accordance with the flags on the rctl_dict
2153  *   entry for the given control, the appropriate actions on the exceeded
2154  *   control value.  Additionally, rctl_global_action() updates the firing time
2155  *   on the exceeded value.
2156  *
2157  * Return values
2158  *   A bitmask reflecting the actions actually taken.
2159  *
2160  * Caller's context
2161  *   No restrictions on context.
2162  */
2163 /*ARGSUSED*/
2164 static int
2165 rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v)
2166 {
2167 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2168 	const char *pr, *en, *idstr;
2169 	id_t id;
2170 	enum {
2171 		SUFFIX_NONE,	/* id consumed directly */
2172 		SUFFIX_NUMERIC,	/* id consumed in suffix */
2173 		SUFFIX_STRING	/* idstr consumed in suffix */
2174 	} suffix = SUFFIX_NONE;
2175 	int ret = 0;
2176 
2177 	v->rcv_firing_time = gethrtime();
2178 
2179 	switch (v->rcv_privilege) {
2180 	case RCPRIV_BASIC:
2181 		pr = "basic";
2182 		break;
2183 	case RCPRIV_PRIVILEGED:
2184 		pr = "privileged";
2185 		break;
2186 	case RCPRIV_SYSTEM:
2187 		pr = "system";
2188 		break;
2189 	default:
2190 		pr = "unknown";
2191 		break;
2192 	}
2193 
2194 	switch (rde->rcd_entity) {
2195 	case RCENTITY_PROCESS:
2196 		en = "process";
2197 		id = p->p_pid;
2198 		suffix = SUFFIX_NONE;
2199 		break;
2200 	case RCENTITY_TASK:
2201 		en = "task";
2202 		id = p->p_task->tk_tkid;
2203 		suffix = SUFFIX_NUMERIC;
2204 		break;
2205 	case RCENTITY_PROJECT:
2206 		en = "project";
2207 		id = p->p_task->tk_proj->kpj_id;
2208 		suffix = SUFFIX_NUMERIC;
2209 		break;
2210 	case RCENTITY_ZONE:
2211 		en = "zone";
2212 		idstr = p->p_zone->zone_name;
2213 		suffix = SUFFIX_STRING;
2214 		break;
2215 	default:
2216 		en = "unknown entity associated with process";
2217 		id = p->p_pid;
2218 		suffix = SUFFIX_NONE;
2219 		break;
2220 	}
2221 
2222 	if (rde->rcd_flagaction & RCTL_GLOBAL_SYSLOG) {
2223 		switch (suffix) {
2224 		default:
2225 		case SUFFIX_NONE:
2226 			(void) strlog(0, 0, 0,
2227 			    rde->rcd_strlog_flags | log_global.lz_active,
2228 			    "%s rctl %s (value %llu) exceeded by %s %d.",
2229 			    pr, rde->rcd_name, v->rcv_value, en, id);
2230 			break;
2231 		case SUFFIX_NUMERIC:
2232 			(void) strlog(0, 0, 0,
2233 			    rde->rcd_strlog_flags | log_global.lz_active,
2234 			    "%s rctl %s (value %llu) exceeded by process %d"
2235 			    " in %s %d.",
2236 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2237 			    en, id);
2238 			break;
2239 		case SUFFIX_STRING:
2240 			(void) strlog(0, 0, 0,
2241 			    rde->rcd_strlog_flags | log_global.lz_active,
2242 			    "%s rctl %s (value %llu) exceeded by process %d"
2243 			    " in %s %s.",
2244 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2245 			    en, idstr);
2246 			break;
2247 		}
2248 	}
2249 
2250 	if (rde->rcd_flagaction & RCTL_GLOBAL_DENY_ALWAYS)
2251 		ret |= RCT_DENY;
2252 
2253 	return (ret);
2254 }
2255 
2256 static int
2257 rctl_local_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v,
2258     uint_t safety)
2259 {
2260 	int ret = 0;
2261 	sigqueue_t *sqp = NULL;
2262 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2263 	int unobservable = (rde->rcd_flagaction & RCTL_GLOBAL_UNOBSERVABLE);
2264 
2265 	proc_t *recipient = v->rcv_action_recipient;
2266 	id_t recip_pid = v->rcv_action_recip_pid;
2267 	int recip_signal = v->rcv_action_signal;
2268 	uint_t flagaction = v->rcv_flagaction;
2269 
2270 	if (safety == RCA_UNSAFE_ALL) {
2271 		if (flagaction & RCTL_LOCAL_DENY) {
2272 			ret |= RCT_DENY;
2273 		}
2274 		return (ret);
2275 	}
2276 
2277 	if (flagaction & RCTL_LOCAL_SIGNAL) {
2278 		/*
2279 		 * We can build a siginfo only in the case that it is
2280 		 * safe for us to drop p_lock.  (For asynchronous
2281 		 * checks this is currently not true.)
2282 		 */
2283 		if (safety == RCA_SAFE) {
2284 			mutex_exit(&rset->rcs_lock);
2285 			mutex_exit(&p->p_lock);
2286 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
2287 			mutex_enter(&p->p_lock);
2288 			mutex_enter(&rset->rcs_lock);
2289 
2290 			sqp->sq_info.si_signo = recip_signal;
2291 			sqp->sq_info.si_code = SI_RCTL;
2292 			sqp->sq_info.si_errno = 0;
2293 			sqp->sq_info.si_entity = (int)rde->rcd_entity;
2294 		}
2295 
2296 		if (recipient == NULL || recipient == p) {
2297 			ret |= RCT_SIGNAL;
2298 
2299 			if (sqp == NULL) {
2300 				sigtoproc(p, NULL, recip_signal);
2301 			} else if (p == curproc) {
2302 				/*
2303 				 * Then this is a synchronous test and we can
2304 				 * direct the signal at the violating thread.
2305 				 */
2306 				sigaddqa(curproc, curthread, sqp);
2307 			} else {
2308 				sigaddqa(p, NULL, sqp);
2309 			}
2310 		} else if (!unobservable) {
2311 			proc_t *rp;
2312 
2313 			mutex_exit(&rset->rcs_lock);
2314 			mutex_exit(&p->p_lock);
2315 
2316 			mutex_enter(&pidlock);
2317 			if ((rp = prfind(recip_pid)) == recipient) {
2318 				/*
2319 				 * Recipient process is still alive, but may not
2320 				 * be in this task or project any longer.  In
2321 				 * this case, the recipient's resource control
2322 				 * set pertinent to this control will have
2323 				 * changed--and we will not deliver the signal,
2324 				 * as the recipient process is trying to tear
2325 				 * itself off of its former set.
2326 				 */
2327 				mutex_enter(&rp->p_lock);
2328 				mutex_exit(&pidlock);
2329 
2330 				if (rctl_entity_obtain_rset(rde, rp) == rset) {
2331 					ret |= RCT_SIGNAL;
2332 
2333 					if (sqp == NULL)
2334 						sigtoproc(rp, NULL,
2335 						    recip_signal);
2336 					else
2337 						sigaddqa(rp, NULL, sqp);
2338 				} else if (sqp) {
2339 					kmem_free(sqp, sizeof (sigqueue_t));
2340 				}
2341 				mutex_exit(&rp->p_lock);
2342 			} else {
2343 				mutex_exit(&pidlock);
2344 				if (sqp)
2345 					kmem_free(sqp, sizeof (sigqueue_t));
2346 			}
2347 
2348 			mutex_enter(&p->p_lock);
2349 			/*
2350 			 * Since we dropped p_lock, we may no longer be in the
2351 			 * same task or project as we were at entry.  It is thus
2352 			 * unsafe for us to reacquire the set lock at this
2353 			 * point; callers of rctl_local_action() must handle
2354 			 * this possibility.
2355 			 */
2356 			ret |= RCT_LK_ABANDONED;
2357 		} else if (sqp) {
2358 			kmem_free(sqp, sizeof (sigqueue_t));
2359 		}
2360 	}
2361 
2362 	if ((flagaction & RCTL_LOCAL_DENY) &&
2363 	    (recipient == NULL || recipient == p)) {
2364 		ret |= RCT_DENY;
2365 	}
2366 
2367 	return (ret);
2368 }
2369 
2370 /*
2371  * int rctl_action(rctl_hndl_t, rctl_set_t *, struct proc *, uint_t)
2372  *
2373  * Overview
2374  *   Take the action associated with the enforced value (as defined by
2375  *   rctl_get_enforced_value()) being exceeded or encountered.  Possibly perform
2376  *   a restricted subset of the available actions, if circumstances dictate that
2377  *   we cannot safely allocate memory (for a sigqueue_t) or guarantee process
2378  *   persistence across the duration of the function (an asynchronous action).
2379  *
2380  * Return values
2381  *   Actions taken, according to the rctl_test bitmask.
2382  *
2383  * Caller's context
2384  *   Safe to acquire rcs_lock.
2385  */
2386 int
2387 rctl_action(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p, uint_t safety)
2388 {
2389 	return (rctl_action_entity(hndl, rset, p, NULL, safety));
2390 }
2391 
2392 int
2393 rctl_action_entity(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p,
2394     rctl_entity_p_t *e, uint_t safety)
2395 {
2396 	int ret = RCT_NONE;
2397 	rctl_t *lrctl;
2398 	rctl_entity_p_t e_tmp;
2399 
2400 rctl_action_acquire:
2401 	mutex_enter(&rset->rcs_lock);
2402 	if (rctl_set_find(rset, hndl, &lrctl) == -1) {
2403 		mutex_exit(&rset->rcs_lock);
2404 		return (ret);
2405 	}
2406 
2407 	if (e == NULL) {
2408 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2409 		p, &e_tmp);
2410 		e = &e_tmp;
2411 	}
2412 
2413 	if ((ret & RCT_LK_ABANDONED) == 0) {
2414 		ret |= rctl_global_action(lrctl, rset, p, lrctl->rc_cursor);
2415 
2416 		RCTLOP_ACTION(lrctl, p, e);
2417 
2418 		ret |= rctl_local_action(lrctl, rset, p,
2419 		    lrctl->rc_cursor, safety);
2420 
2421 		if (ret & RCT_LK_ABANDONED)
2422 			goto rctl_action_acquire;
2423 	}
2424 
2425 	ret &= ~RCT_LK_ABANDONED;
2426 
2427 	if (!(ret & RCT_DENY) &&
2428 	    lrctl->rc_cursor->rcv_next != NULL) {
2429 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2430 
2431 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2432 		    p, lrctl->rc_cursor->rcv_value));
2433 
2434 	}
2435 	mutex_exit(&rset->rcs_lock);
2436 
2437 	return (ret);
2438 }
2439 
2440 /*
2441  * int rctl_test(rctl_hndl_t, rctl_set_t *, struct proc *, rctl_qty_t, uint_t)
2442  *
2443  * Overview
2444  *   Increment the resource associated with the given handle, returning zero if
2445  *   the incremented value does not exceed the threshold for the current limit
2446  *   on the resource.
2447  *
2448  * Return values
2449  *   Actions taken, according to the rctl_test bitmask.
2450  *
2451  * Caller's context
2452  *   p_lock held by caller.
2453  */
2454 /*ARGSUSED*/
2455 int
2456 rctl_test(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2457     rctl_qty_t incr, uint_t flags)
2458 {
2459 	return (rctl_test_entity(rhndl, rset, p, NULL, incr, flags));
2460 }
2461 
2462 int
2463 rctl_test_entity(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2464     rctl_entity_p_t *e, rctl_qty_t incr, uint_t flags)
2465 {
2466 	rctl_t *lrctl;
2467 	int ret = RCT_NONE;
2468 	rctl_entity_p_t e_tmp;
2469 	if (p == &p0) {
2470 		/*
2471 		 * We don't enforce rctls on the kernel itself.
2472 		 */
2473 		return (ret);
2474 	}
2475 
2476 rctl_test_acquire:
2477 	ASSERT(MUTEX_HELD(&p->p_lock));
2478 
2479 	mutex_enter(&rset->rcs_lock);
2480 
2481 	/*
2482 	 * Dereference from rctl_set.  We don't enforce newly loaded controls
2483 	 * that haven't been set on this entity (since the only valid value is
2484 	 * the infinite system value).
2485 	 */
2486 	if (rctl_set_find(rset, rhndl, &lrctl) == -1) {
2487 		mutex_exit(&rset->rcs_lock);
2488 		return (ret);
2489 	}
2490 
2491 	/*
2492 	 * This control is currently unenforced:  maximal value on control
2493 	 * supporting infinitely available resource.
2494 	 */
2495 	if ((lrctl->rc_dict_entry->rcd_flagaction & RCTL_GLOBAL_INFINITE) &&
2496 	    (lrctl->rc_cursor->rcv_flagaction & RCTL_LOCAL_MAXIMAL)) {
2497 
2498 		mutex_exit(&rset->rcs_lock);
2499 		return (ret);
2500 	}
2501 
2502 	/*
2503 	 * If we have been called by rctl_test, look up the entity pointer
2504 	 * from the proc pointer.
2505 	 */
2506 	if (e == NULL) {
2507 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2508 		p, &e_tmp);
2509 		e = &e_tmp;
2510 	}
2511 
2512 	/*
2513 	 * Get enforced rctl value and current usage.  Test the increment
2514 	 * with the current usage against the enforced value--take action as
2515 	 * necessary.
2516 	 */
2517 	while (RCTLOP_TEST(lrctl, p, e, lrctl->rc_cursor, incr, flags)) {
2518 		if ((ret & RCT_LK_ABANDONED) == 0) {
2519 			ret |= rctl_global_action(lrctl, rset, p,
2520 			    lrctl->rc_cursor);
2521 
2522 			RCTLOP_ACTION(lrctl, p, e);
2523 
2524 			ret |= rctl_local_action(lrctl, rset, p,
2525 			    lrctl->rc_cursor, flags);
2526 
2527 			if (ret & RCT_LK_ABANDONED)
2528 				goto rctl_test_acquire;
2529 		}
2530 
2531 		ret &= ~RCT_LK_ABANDONED;
2532 
2533 		if ((ret & RCT_DENY) == RCT_DENY ||
2534 		    lrctl->rc_cursor->rcv_next == NULL) {
2535 			ret |= RCT_DENY;
2536 			break;
2537 		}
2538 
2539 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2540 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2541 		    p, lrctl->rc_cursor->rcv_value));
2542 	}
2543 
2544 	mutex_exit(&rset->rcs_lock);
2545 
2546 	return (ret);
2547 }
2548 
2549 /*
2550  * void rctl_init(void)
2551  *
2552  * Overview
2553  *   Initialize the rctl subsystem, including the primoridal rctls
2554  *   provided by the system.  New subsystem-specific rctls should _not_ be
2555  *   initialized here.  (Do it in your own file.)
2556  *
2557  * Return values
2558  *   None.
2559  *
2560  * Caller's context
2561  *   Safe for KM_SLEEP allocations.  Must be called prior to any process model
2562  *   initialization.
2563  */
2564 void
2565 rctl_init(void)
2566 {
2567 	rctl_cache = kmem_cache_create("rctl_cache", sizeof (rctl_t),
2568 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2569 	rctl_val_cache = kmem_cache_create("rctl_val_cache",
2570 	    sizeof (rctl_val_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
2571 
2572 	rctl_dict = mod_hash_create_extended("rctl_dict",
2573 	    rctl_dict_size, mod_hash_null_keydtor, rctl_dict_val_dtor,
2574 	    rctl_dict_hash_by_id, NULL, rctl_dict_id_cmp, KM_SLEEP);
2575 	rctl_dict_by_name = mod_hash_create_strhash(
2576 	    "rctl_handles_by_name", rctl_dict_size,
2577 	    mod_hash_null_valdtor);
2578 	rctl_ids = id_space_create("rctl_ids", 1, max_rctl_hndl);
2579 	bzero(rctl_lists, (RC_MAX_ENTITY + 1) * sizeof (rctl_dict_entry_t *));
2580 
2581 	rctlproc_init();
2582 }
2583 
2584 /*
2585  * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc)
2586  *
2587  * Increments the amount of locked memory on a project, and
2588  * zone. If proj is NULL, the proj and zone of proc_t p is used.  If
2589  * chargeproc is non-zero, then the charged amount is cached on p->p_locked_mem
2590  * so that the charge can be migrated when a process changes projects.
2591  *
2592  * Return values
2593  *    0 - success
2594  *    EAGAIN - attempting to increment locked memory is denied by one
2595  *      or more resource entities.
2596  */
2597 int
2598 rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2599     int chargeproc)
2600 {
2601 	kproject_t *projp;
2602 	zone_t *zonep;
2603 	rctl_entity_p_t e;
2604 	int ret = 0;
2605 
2606 	ASSERT(p != NULL);
2607 	ASSERT(MUTEX_HELD(&p->p_lock));
2608 	if (proj != NULL) {
2609 		projp = proj;
2610 		zonep = zone_find_by_id(projp->kpj_zoneid);
2611 	} else {
2612 		projp = p->p_task->tk_proj;
2613 		zonep = p->p_zone;
2614 	}
2615 
2616 	mutex_enter(&zonep->zone_mem_lock);
2617 
2618 	e.rcep_p.proj = projp;
2619 	e.rcep_t = RCENTITY_PROJECT;
2620 	if (projp->kpj_data.kpd_locked_mem + inc >
2621 	    projp->kpj_data.kpd_locked_mem_ctl) {
2622 		if (rctl_test_entity(rc_project_locked_mem, projp->kpj_rctls,
2623 		    p, &e, inc, 0) & RCT_DENY) {
2624 			ret = EAGAIN;
2625 			goto out;
2626 		}
2627 	}
2628 	e.rcep_p.zone = zonep;
2629 	e.rcep_t = RCENTITY_ZONE;
2630 	if (zonep->zone_locked_mem + inc > zonep->zone_locked_mem_ctl) {
2631 		if (rctl_test_entity(rc_zone_locked_mem, zonep->zone_rctls,
2632 		    p, &e, inc, 0) & RCT_DENY) {
2633 			ret = EAGAIN;
2634 			goto out;
2635 		}
2636 	}
2637 
2638 	zonep->zone_locked_mem += inc;
2639 	projp->kpj_data.kpd_locked_mem += inc;
2640 	if (chargeproc != 0) {
2641 		p->p_locked_mem += inc;
2642 	}
2643 out:
2644 	mutex_exit(&zonep->zone_mem_lock);
2645 	if (proj != NULL)
2646 		zone_rele(zonep);
2647 	return (ret);
2648 }
2649 
2650 /*
2651  * rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc)
2652  *
2653  * Decrements the amount of locked memory on a project and
2654  * zone.  If proj is NULL, the proj and zone of proc_t p is used.  If
2655  * creditproc is non-zero, then the quantity of locked memory is subtracted
2656  * from p->p_locked_mem.
2657  *
2658  * Return values
2659  *   none
2660  */
2661 void
2662 rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2663     int creditproc)
2664 {
2665 	kproject_t *projp;
2666 	zone_t *zonep;
2667 
2668 	if (proj != NULL) {
2669 		projp = proj;
2670 		zonep = zone_find_by_id(projp->kpj_zoneid);
2671 	} else {
2672 		ASSERT(p != NULL);
2673 		ASSERT(MUTEX_HELD(&p->p_lock));
2674 		projp = p->p_task->tk_proj;
2675 		zonep = p->p_zone;
2676 	}
2677 
2678 	mutex_enter(&zonep->zone_mem_lock);
2679 	zonep->zone_locked_mem -= inc;
2680 	projp->kpj_data.kpd_locked_mem -= inc;
2681 	if (creditproc != 0) {
2682 		ASSERT(p != NULL);
2683 		ASSERT(MUTEX_HELD(&p->p_lock));
2684 		p->p_locked_mem -= inc;
2685 	}
2686 	mutex_exit(&zonep->zone_mem_lock);
2687 	if (proj != NULL)
2688 		zone_rele(zonep);
2689 }
2690 
2691 /*
2692  * rctl_incr_swap(proc_t *, zone_t *, size_t)
2693  *
2694  * Overview
2695  *   Increments the swap charge on the specified zone.
2696  *
2697  * Return values
2698  *   0 on success.  EAGAIN if swap increment fails due an rctl value
2699  *   on the zone.
2700  *
2701  * Callers context
2702  *   p_lock held on specified proc.
2703  *   swap must be even multiple of PAGESIZE
2704  */
2705 int
2706 rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap)
2707 {
2708 	rctl_entity_p_t e;
2709 
2710 	ASSERT(MUTEX_HELD(&proc->p_lock));
2711 	ASSERT((swap & PAGEOFFSET) == 0);
2712 	e.rcep_p.zone = zone;
2713 	e.rcep_t = RCENTITY_ZONE;
2714 
2715 	mutex_enter(&zone->zone_mem_lock);
2716 
2717 	if ((zone->zone_max_swap + swap) >
2718 	    zone->zone_max_swap_ctl) {
2719 
2720 		if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls,
2721 		    proc, &e, swap, 0) & RCT_DENY) {
2722 			mutex_exit(&zone->zone_mem_lock);
2723 			return (EAGAIN);
2724 		}
2725 	}
2726 	zone->zone_max_swap += swap;
2727 	mutex_exit(&zone->zone_mem_lock);
2728 	return (0);
2729 }
2730 
2731 /*
2732  * rctl_decr_swap(zone_t *, size_t)
2733  *
2734  * Overview
2735  *   Decrements the swap charge on the specified zone.
2736  *
2737  * Return values
2738  *   None
2739  *
2740  * Callers context
2741  *   swap must be even multiple of PAGESIZE
2742  */
2743 void
2744 rctl_decr_swap(zone_t *zone, size_t swap)
2745 {
2746 	ASSERT((swap & PAGEOFFSET) == 0);
2747 	mutex_enter(&zone->zone_mem_lock);
2748 	ASSERT(zone->zone_max_swap >= swap);
2749 	zone->zone_max_swap -= swap;
2750 	mutex_exit(&zone->zone_mem_lock);
2751 }
2752 
2753 /*
2754  * Create resource kstat
2755  */
2756 static kstat_t *
2757 rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class,
2758     uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid)
2759 {
2760 	kstat_t *ksp = NULL;
2761 	char name[KSTAT_STRLEN];
2762 
2763 	(void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance);
2764 
2765 	if ((ksp = kstat_create_zone("caps", ks_zoneid,
2766 		name, ks_class, ks_type,
2767 		ks_ndata, ks_flags, ks_zoneid)) != NULL) {
2768 		if (ks_zoneid != GLOBAL_ZONEID)
2769 			kstat_zone_add(ksp, GLOBAL_ZONEID);
2770 	}
2771 	return (ksp);
2772 }
2773 
2774 /*
2775  * Create zone-specific resource kstat
2776  */
2777 kstat_t *
2778 rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type,
2779     uint_t ks_ndata, uchar_t ks_flags)
2780 {
2781 	char name[KSTAT_STRLEN];
2782 
2783 	(void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name);
2784 
2785 	return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps",
2786 	    ks_type, ks_ndata, ks_flags, zone->zone_id));
2787 }
2788 
2789 /*
2790  * Create project-specific resource kstat
2791  */
2792 kstat_t *
2793 rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type,
2794     uint_t ks_ndata, uchar_t ks_flags)
2795 {
2796 	char name[KSTAT_STRLEN];
2797 
2798 	(void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name);
2799 
2800 	return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps",
2801 	    ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid));
2802 }
2803