xref: /titanic_51/usr/src/uts/common/os/rctl.c (revision 60b08185ce63023f22fd6b2ed0db8c0d119b2023)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/atomic.h>
30 #include <sys/cmn_err.h>
31 #include <sys/id_space.h>
32 #include <sys/kmem.h>
33 #include <sys/log.h>
34 #include <sys/modctl.h>
35 #include <sys/modhash.h>
36 #include <sys/mutex.h>
37 #include <sys/proc.h>
38 #include <sys/procset.h>
39 #include <sys/project.h>
40 #include <sys/resource.h>
41 #include <sys/rctl.h>
42 #include <sys/siginfo.h>
43 #include <sys/strlog.h>
44 #include <sys/systm.h>
45 #include <sys/task.h>
46 #include <sys/types.h>
47 #include <sys/policy.h>
48 #include <sys/zone.h>
49 
50 /*
51  * Resource controls (rctls)
52  *
53  *   The rctl subsystem provides a mechanism for kernel components to
54  *   register their individual resource controls with the system as a whole,
55  *   such that those controls can subscribe to specific actions while being
56  *   associated with the various process-model entities provided by the kernel:
57  *   the process, the task, the project, and the zone.  (In principle, only
58  *   minor modifications would be required to connect the resource control
59  *   functionality to non-process-model entities associated with the system.)
60  *
61  *   Subsystems register their rctls via rctl_register().  Subsystems
62  *   also wishing to provide additional limits on a given rctl can modify
63  *   them once they have the rctl handle.  Each subsystem should store the
64  *   handle to their rctl for direct access.
65  *
66  *   A primary dictionary, rctl_dict, contains a hash of id to the default
67  *   control definition for each controlled resource-entity pair on the system.
68  *   A secondary dictionary, rctl_dict_by_name, contains a hash of name to
69  *   resource control handles.  The resource control handles are distributed by
70  *   the rctl_ids ID space.  The handles are private and not to be
71  *   advertised to userland; all userland interactions are via the rctl
72  *   names.
73  *
74  *   Entities inherit their rctls from their predecessor.  Since projects have
75  *   no ancestor, they inherit their rctls from the rctl dict for project
76  *   rctls.  It is expected that project controls will be set to their
77  *   appropriate values shortly after project creation, presumably from a
78  *   policy source such as the project database.
79  *
80  * Data structures
81  *   The rctl_set_t attached to each of the process model entities is a simple
82  *   hash table keyed on the rctl handle assigned at registration.  The entries
83  *   in the hash table are rctl_t's, whose relationship with the active control
84  *   values on that resource and with the global state of the resource we
85  *   illustrate below:
86  *
87  *   rctl_dict[key] --> rctl_dict_entry
88  *			   ^
89  *			   |
90  *			+--+---+
91  *   rctl_set[key] ---> | rctl | --> value <-> value <-> system value --> NULL
92  *			+--+---+		 ^
93  *			   |			 |
94  *			   +------- cursor ------+
95  *
96  *   That is, the rctl contains a back pointer to the global resource control
97  *   state for this resource, which is also available in the rctl_dict hash
98  *   table mentioned earlier.  The rctl contains two pointers to resource
99  *   control values:  one, values, indicates the entire sequence of control
100  *   values; the other, cursor, indicates the currently active control
101  *   value--the next value to be enforced.  The value list itself is an open,
102  *   doubly-linked list, the last non-NULL member of which is the system value
103  *   for that resource (being the theoretical/conventional maximum allowable
104  *   value for the resource on this OS instance).
105  *
106  * Ops Vector
107  *   Subsystems publishing rctls need not provide instances of all of the
108  *   functions specified by the ops vector.  In particular, if general
109  *   rctl_*() entry points are not being called, certain functions can be
110  *   omitted.  These align as follows:
111  *
112  *   rctl_set()
113  *     You may wish to provide a set callback if locking circumstances prevent
114  *     it or if the performance cost of requesting the enforced value from the
115  *     resource control is prohibitively expensive.  For instance, the currently
116  *     enforced file size limit is stored on the process in the p_fsz_ctl to
117  *     maintain read()/write() performance.
118  *
119  *   rctl_test()
120  *     You must provide a test callback if you are using the rctl_test()
121  *     interface.  An action callback is optional.
122  *
123  *   rctl_action()
124  *     You may wish to provide an action callback.
125  *
126  * Registration
127  *   New resource controls can be added to a running instance by loaded modules
128  *   via registration.  (The current implementation does not support unloadable
129  *   modules; this functionality can be added if needed, via an
130  *   activation/deactivation interface involving the manipulation of the
131  *   ops vector for the resource control(s) needing to support unloading.)
132  *
133  * Control value ordering
134  *   Because the rctl_val chain on each rctl must be navigable in a
135  *   deterministic way, we have to define an ordering on the rctl_val_t's.  The
136  *   defined order is (flags & [maximal], value, flags & [deny-action],
137  *   privilege).
138  *
139  * Locking
140  *   rctl_dict_lock must be acquired prior to rctl_lists_lock.  Since
141  *   rctl_dict_lock or rctl_lists_lock can be called at the enforcement point
142  *   of any subsystem, holding subsystem locks, it is at all times inappropriate
143  *   to call kmem_alloc(., KM_SLEEP) while holding either of these locks.
144  *   Traversing any of the various resource control entity lists requires
145  *   holding rctl_lists_lock.
146  *
147  *   Each individual resource control set associated with an entity must have
148  *   its rcs_lock held for the duration of any operations that would add
149  *   resource controls or control values to the set.
150  *
151  *   The locking subsequence of interest is: p_lock, rctl_dict_lock,
152  *   rctl_lists_lock, entity->rcs_lock.
153  */
154 
155 id_t max_rctl_hndl = 32768;	/* presumably the handle id ceiling; confirm */
156 int rctl_dict_size = 64;	/* modulus used by rctl_dict_hash_by_id() */
157 int rctl_set_size = 8;	/* hash buckets in each rctl_set_t's rcs_ctls */
158 kmutex_t rctl_dict_lock;	/* held across rctl_dict_by_name lookups */
159 mod_hash_t *rctl_dict;	/* id -> default control definition */
160 mod_hash_t *rctl_dict_by_name;	/* name -> rctl dictionary entry */
161 id_space_t *rctl_ids;	/* ID space from which handles are distributed */
162 kmem_cache_t *rctl_cache;	/* kmem cache for rctl structures */
163 kmem_cache_t *rctl_val_cache;	/* kmem cache for rctl values */
164 
165 kmutex_t rctl_lists_lock;	/* held while walking any rctl_lists[] chain */
166 rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1];	/* per-entity rcd_next chains */
167 
168 /*
169  * Default resource control operations and ops vector
170  *   To be used if the particular rcontrol has no specific actions defined, or
171  *   if the subsystem providing the control is quiescing (in preparation for
172  *   unloading, presumably.)
173  *
174  *   Resource controls with callbacks should fill the unused operations with the
175  *   appropriate default impotent callback.
176  */
177 /*ARGSUSED*/
178 void
179 rcop_no_action(struct rctl *r, struct proc *p, rctl_entity_p_t *e)
180 {
181 }
182 
183 /*ARGSUSED*/
184 rctl_qty_t
185 rcop_no_usage(struct rctl *r, struct proc *p)
186 {
187 	return (0);
188 }
189 
190 /*ARGSUSED*/
191 int
192 rcop_no_set(struct rctl *r, struct proc *p, rctl_entity_p_t *e, rctl_qty_t l)
193 {
194 	return (0);
195 }
196 
197 /*ARGSUSED*/
198 int
199 rcop_no_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
200     struct rctl_val *rv, rctl_qty_t i, uint_t f)
201 {
202 	return (0);
203 }
204 
205 rctl_ops_t rctl_default_ops = {
206 	rcop_no_action,		/* empty action callback */
207 	rcop_no_usage,		/* usage callback that always reports 0 */
208 	rcop_no_set,		/* set callback that only returns 0 */
209 	rcop_no_test		/* test callback that always returns 0 */
210 };
211 
212 /*
213  * Default "absolute" resource control operation and ops vector
214  *   Useful if there is no usage associated with the
215  *   resource control.
216  */
217 /*ARGSUSED*/
218 int
219 rcop_absolute_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
220     struct rctl_val *rv, rctl_qty_t i, uint_t f)
221 {
222 	return (i > rv->rcv_value);
223 }
224 
225 rctl_ops_t rctl_absolute_ops = {
226 	rcop_no_action,		/* empty action callback */
227 	rcop_no_usage,		/* usage callback that always reports 0 */
228 	rcop_no_set,		/* set callback that only returns 0 */
229 	rcop_absolute_test	/* compares the quantity against rcv_value */
230 };
231 
232 /*ARGSUSED*/
233 static uint_t
234 rctl_dict_hash_by_id(void *hash_data, mod_hash_key_t key)
235 {
236 	return ((uint_t)(uintptr_t)key % rctl_dict_size);
237 }
238 
239 static int
240 rctl_dict_id_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
241 {
242 	uint_t u1 = (uint_t)(uintptr_t)key1;
243 	uint_t u2 = (uint_t)(uintptr_t)key2;
244 
245 	if (u1 > u2)
246 		return (1);
247 
248 	if (u1 == u2)
249 		return (0);
250 
251 	return (-1);
252 }
253 
254 static void
255 rctl_dict_val_dtor(mod_hash_val_t val)
256 {
257 	rctl_dict_entry_t *kr = (rctl_dict_entry_t *)val;
258 
259 	kmem_free(kr, sizeof (rctl_dict_entry_t));
260 }
261 
262 /*
263  * size_t rctl_build_name_buf()
264  *
265  * Overview
266  *   rctl_build_name_buf() walks all active resource controls in the dictionary,
267  *   building a buffer of contiguous NUL-terminated strings.
268  *
269  * Return values
270  *   The size of the buffer is returned, the passed pointer's contents are
271  *   modified to that of the location of the buffer.
272  *
273  * Caller's context
274  *   Caller must be in a context suitable for KM_SLEEP allocations.
275  */
276 size_t
277 rctl_build_name_buf(char **rbufp)
278 {
279 	size_t req_size, cpy_size;
280 	char *rbufloc;
281 	int i;
282 
283 rctl_rebuild_name_buf:
284 	req_size = cpy_size = 0;
285 
286 	/*
287 	 * Calculate needed buffer length.
288 	 */
289 	mutex_enter(&rctl_lists_lock);
290 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
291 		rctl_dict_entry_t *rde;
292 
293 		for (rde = rctl_lists[i];
294 		    rde != NULL;
295 		    rde = rde->rcd_next)
296 			req_size += strlen(rde->rcd_name) + 1;
297 	}
298 	mutex_exit(&rctl_lists_lock);
299 
300 	rbufloc = *rbufp = kmem_alloc(req_size, KM_SLEEP);
301 
302 	/*
303 	 * Copy rctl names into our buffer.  If the copy length exceeds the
304 	 * allocate length (due to registration changes), stop copying, free the
305 	 * buffer, and start again.
306 	 */
307 	mutex_enter(&rctl_lists_lock);
308 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
309 		rctl_dict_entry_t *rde;
310 
311 		for (rde = rctl_lists[i];
312 		    rde != NULL;
313 		    rde = rde->rcd_next) {
314 			size_t length = strlen(rde->rcd_name) + 1;
315 
316 			cpy_size += length;
317 
318 			if (cpy_size > req_size) {
319 				kmem_free(*rbufp, req_size);
320 				mutex_exit(&rctl_lists_lock);
321 				goto rctl_rebuild_name_buf;
322 			}
323 
324 			bcopy(rde->rcd_name, rbufloc, length);
325 			rbufloc += length;
326 		}
327 	}
328 	mutex_exit(&rctl_lists_lock);
329 
330 	return (req_size);
331 }
332 
333 /*
334  * rctl_dict_entry_t *rctl_dict_lookup(const char *)
335  *
336  * Overview
337  *   rctl_dict_lookup() returns the resource control dictionary entry for the
338  *   named resource control.
339  *
340  * Return values
341  *   A pointer to the appropriate resource control dictionary entry, or NULL if
342  *   no such named entry exists.
343  *
344  * Caller's context
345  *   Caller must not be holding rctl_dict_lock.
346  */
347 rctl_dict_entry_t *
348 rctl_dict_lookup(const char *name)
349 {
350 	rctl_dict_entry_t *rde;
351 
352 	mutex_enter(&rctl_dict_lock);
353 
354 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
355 	    (mod_hash_val_t *)&rde) == MH_ERR_NOTFOUND) {
356 		mutex_exit(&rctl_dict_lock);
357 		return (NULL);
358 	}
359 
360 	mutex_exit(&rctl_dict_lock);
361 
362 	return (rde);
363 }
364 
365 /*
366  * rctl_hndl_t rctl_hndl_lookup(const char *)
367  *
368  * Overview
369  *   rctl_hndl_lookup() returns the resource control id (the "handle") for the
370  *   named resource control.
371  *
372  * Return values
373  *   The appropriate id, or -1 if no such named entry exists.
374  *
375  * Caller's context
376  *   Caller must not be holding rctl_dict_lock.
377  */
378 rctl_hndl_t
379 rctl_hndl_lookup(const char *name)
380 {
381 	rctl_dict_entry_t *rde;
382 
383 	if ((rde = rctl_dict_lookup(name)) == NULL)
384 		return (-1);
385 
386 	return (rde->rcd_id);
387 }
388 
389 /*
390  * rctl_dict_entry_t * rctl_dict_lookup_hndl(rctl_hndl_t)
391  *
392  * Overview
393  *   rctl_dict_lookup_hndl() completes the public lookup functions, by returning
394  *   the resource control dictionary entry matching a given resource control id.
395  *
396  * Return values
397  *   A pointer to the matching resource control dictionary entry, or NULL if the
398  *   id does not match any existing entries.
399  *
400  * Caller's context
401  *   Caller must not be holding rctl_lists_lock.
402  */
403 rctl_dict_entry_t *
404 rctl_dict_lookup_hndl(rctl_hndl_t hndl)
405 {
406 	uint_t i;
407 
408 	mutex_enter(&rctl_lists_lock);
409 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
410 		rctl_dict_entry_t *rde;
411 
412 		for (rde = rctl_lists[i];
413 		    rde != NULL;
414 		    rde = rde->rcd_next)
415 			if (rde->rcd_id == hndl) {
416 				mutex_exit(&rctl_lists_lock);
417 				return (rde);
418 			}
419 	}
420 	mutex_exit(&rctl_lists_lock);
421 
422 	return (NULL);
423 }
424 
425 /*
426  * void rctl_add_default_limit(const char *name, rctl_qty_t value,
427  *     rctl_priv_t privilege, uint_t action)
428  *
429  * Overview
430  *   Create a default limit with specified value, privilege, and action.
431  *
432  * Return value
433  *   No value returned.
434  */
435 void
436 rctl_add_default_limit(const char *name, rctl_qty_t value,
437     rctl_priv_t privilege, uint_t action)
438 {
439 	rctl_val_t *dval;
440 	rctl_dict_entry_t *rde;
441 
442 	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
443 	bzero(dval, sizeof (rctl_val_t));
444 	dval->rcv_value = value;
445 	dval->rcv_privilege = privilege;
446 	dval->rcv_flagaction = action;
447 	dval->rcv_action_recip_pid = -1;
448 
449 	rde = rctl_dict_lookup(name);
450 	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
451 }
452 
453 /*
454  * void rctl_add_legacy_limit(const char *name, const char *mname,
455  *     const char *lname, rctl_qty_t dflt, rctl_qty_t max)
456  *
457  * Overview
458  *   Create a default privileged limit, using the value obtained from
459  *   /etc/system if it exists and is greater than the specified default
460  *   value, capped at max.  Exists primarily for System V IPC.
461  *
462  * Return value
463  *   No value returned.
464  */
465 void
466 rctl_add_legacy_limit(const char *name, const char *mname, const char *lname,
467     rctl_qty_t dflt, rctl_qty_t max)
468 {
469 	rctl_qty_t qty;
470 
471 	if (!mod_sysvar(mname, lname, &qty) || (qty < dflt))
472 		qty = dflt;
473 
474 	if (qty > max)
475 		qty = max;
476 
477 	rctl_add_default_limit(name, qty, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
478 }
479 
480 static rctl_set_t *
481 rctl_entity_obtain_rset(rctl_dict_entry_t *rcd, struct proc *p)
482 {
483 	rctl_set_t *rset = NULL;
484 
485 	if (rcd == NULL)
486 		return (NULL);
487 
488 	switch (rcd->rcd_entity) {
489 	case RCENTITY_PROCESS:
490 		rset = p->p_rctls;
491 		break;
492 	case RCENTITY_TASK:
493 		ASSERT(MUTEX_HELD(&p->p_lock));
494 		if (p->p_task != NULL)
495 			rset = p->p_task->tk_rctls;
496 		break;
497 	case RCENTITY_PROJECT:
498 		ASSERT(MUTEX_HELD(&p->p_lock));
499 		if (p->p_task != NULL &&
500 		    p->p_task->tk_proj != NULL)
501 			rset = p->p_task->tk_proj->kpj_rctls;
502 		break;
503 	case RCENTITY_ZONE:
504 		ASSERT(MUTEX_HELD(&p->p_lock));
505 		if (p->p_zone != NULL)
506 			rset = p->p_zone->zone_rctls;
507 		break;
508 	default:
509 		panic("unknown rctl entity type %d seen", rcd->rcd_entity);
510 		break;
511 	}
512 
513 	return (rset);
514 }
515 
516 static void
517 rctl_entity_obtain_entity_p(rctl_entity_t entity, struct proc *p,
518     rctl_entity_p_t *e)
519 {
520 	e->rcep_p.proc = NULL;
521 	e->rcep_t = entity;
522 
523 	switch (entity) {
524 	case RCENTITY_PROCESS:
525 		e->rcep_p.proc = p;
526 		break;
527 	case RCENTITY_TASK:
528 		ASSERT(MUTEX_HELD(&p->p_lock));
529 		if (p->p_task != NULL)
530 			e->rcep_p.task = p->p_task;
531 		break;
532 	case RCENTITY_PROJECT:
533 		ASSERT(MUTEX_HELD(&p->p_lock));
534 		if (p->p_task != NULL &&
535 		    p->p_task->tk_proj != NULL)
536 			e->rcep_p.proj = p->p_task->tk_proj;
537 		break;
538 	case RCENTITY_ZONE:
539 		ASSERT(MUTEX_HELD(&p->p_lock));
540 		if (p->p_zone != NULL)
541 			e->rcep_p.zone = p->p_zone;
542 		break;
543 	default:
544 		panic("unknown rctl entity type %d seen", entity);
545 		break;
546 	}
547 }
548 
549 static void
550 rctl_gp_alloc(rctl_alloc_gp_t *rcgp)
551 {
552 	uint_t i;
553 
554 	if (rcgp->rcag_nctls > 0) {
555 		rctl_t *prev = kmem_cache_alloc(rctl_cache, KM_SLEEP);
556 		rctl_t *rctl = prev;
557 
558 		rcgp->rcag_ctls = prev;
559 
560 		for (i = 1; i < rcgp->rcag_nctls; i++) {
561 			rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
562 			prev->rc_next = rctl;
563 			prev = rctl;
564 		}
565 
566 		rctl->rc_next = NULL;
567 	}
568 
569 	if (rcgp->rcag_nvals > 0) {
570 		rctl_val_t *prev = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
571 		rctl_val_t *rval = prev;
572 
573 		rcgp->rcag_vals = prev;
574 
575 		for (i = 1; i < rcgp->rcag_nvals; i++) {
576 			rval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
577 			prev->rcv_next = rval;
578 			prev = rval;
579 		}
580 
581 		rval->rcv_next = NULL;
582 	}
583 
584 }
585 
586 static rctl_val_t *
587 rctl_gp_detach_val(rctl_alloc_gp_t *rcgp)
588 {
589 	rctl_val_t *rval = rcgp->rcag_vals;
590 
591 	ASSERT(rcgp->rcag_nvals > 0);
592 	rcgp->rcag_nvals--;
593 	rcgp->rcag_vals = rval->rcv_next;
594 
595 	rval->rcv_next = NULL;
596 
597 	return (rval);
598 }
599 
600 static rctl_t *
601 rctl_gp_detach_ctl(rctl_alloc_gp_t *rcgp)
602 {
603 	rctl_t *rctl = rcgp->rcag_ctls;
604 
605 	ASSERT(rcgp->rcag_nctls > 0);
606 	rcgp->rcag_nctls--;
607 	rcgp->rcag_ctls = rctl->rc_next;
608 
609 	rctl->rc_next = NULL;
610 
611 	return (rctl);
612 
613 }
614 
615 static void
616 rctl_gp_free(rctl_alloc_gp_t *rcgp)
617 {
618 	rctl_val_t *rval = rcgp->rcag_vals;
619 	rctl_t *rctl = rcgp->rcag_ctls;
620 
621 	while (rval != NULL) {
622 		rctl_val_t *next = rval->rcv_next;
623 
624 		kmem_cache_free(rctl_val_cache, rval);
625 		rval = next;
626 	}
627 
628 	while (rctl != NULL) {
629 		rctl_t *next = rctl->rc_next;
630 
631 		kmem_cache_free(rctl_cache, rctl);
632 		rctl = next;
633 	}
634 }
635 
636 /*
637  * void rctl_prealloc_destroy(rctl_alloc_gp_t *)
638  *
639  * Overview
640  *   Release all unused memory allocated via one of the "prealloc" functions:
641  *   rctl_set_init_prealloc, rctl_set_dup_prealloc, or rctl_rlimit_set_prealloc.
642  *
643  * Return values
644  *   None.
645  *
646  * Caller's context
647  *   No restrictions on context.
648  */
649 void
650 rctl_prealloc_destroy(rctl_alloc_gp_t *gp)
651 {
652 	rctl_gp_free(gp);
653 	kmem_free(gp, sizeof (rctl_alloc_gp_t));
654 }
655 
656 /*
657  * int rctl_val_cmp(rctl_val_t *, rctl_val_t *, int)
658  *
659  * Overview
660  *   This function defines an ordering to rctl_val_t's in order to allow
661  *   for correct placement in value lists. When the imprecise flag is set,
662  *   the action recipient is ignored. This is to facilitate insert,
663  *   delete, and replace operations by rctlsys.
664  *
665  * Return values
666  *   0 if the val_t's are considered identical
667  *   -1 if a is ordered lower than b
668  *   1 if a is ordered higher than b
669  *
670  * Caller's context
671  *   No restrictions on context.
672  */
673 int
674 rctl_val_cmp(rctl_val_t *a, rctl_val_t *b, int imprecise)
675 {
676 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) <
677 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
678 		return (-1);
679 
680 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) >
681 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
682 		return (1);
683 
684 	if (a->rcv_value < b->rcv_value)
685 		return (-1);
686 
687 	if (a->rcv_value > b->rcv_value)
688 		return (1);
689 
690 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) <
691 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
692 		return (-1);
693 
694 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) >
695 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
696 		return (1);
697 
698 	if (a->rcv_privilege < b->rcv_privilege)
699 		return (-1);
700 
701 	if (a->rcv_privilege > b->rcv_privilege)
702 		return (1);
703 
704 	if (imprecise)
705 		return (0);
706 
707 	if (a->rcv_action_recip_pid < b->rcv_action_recip_pid)
708 		return (-1);
709 
710 	if (a->rcv_action_recip_pid > b->rcv_action_recip_pid)
711 		return (1);
712 
713 	return (0);
714 }
715 
716 static rctl_val_t *
717 rctl_val_list_find(rctl_val_t **head, rctl_val_t *cval)
718 {
719 	rctl_val_t *rval = *head;
720 
721 	while (rval != NULL) {
722 		if (rctl_val_cmp(cval, rval, 0) == 0)
723 			return (rval);
724 
725 		rval = rval->rcv_next;
726 	}
727 
728 	return (NULL);
729 
730 }
731 
732 /*
733  * int rctl_val_list_insert(rctl_val_t **, rctl_val_t *)
734  *
735  * Overview
736  *   This function inserts the rctl_val_t into the value list provided.
737  *   The insert is always successful unless the value is a duplicate
738  *   of one already in the list.
739  *
740  * Return values
741  *    1 if the value was a duplicate of an existing value in the list.
742  *    0 if the insert was successful.
743  */
744 int
745 rctl_val_list_insert(rctl_val_t **root, rctl_val_t *rval)
746 {
747 	rctl_val_t *prev;
748 	int equiv;
749 
750 	rval->rcv_next = NULL;
751 	rval->rcv_prev = NULL;
752 
753 	if (*root == NULL) {
754 		*root = rval;
755 		return (0);
756 	}
757 
758 	equiv = rctl_val_cmp(rval, *root, 0);
759 
760 	if (equiv == 0)
761 		return (1);
762 
763 	if (equiv < 0) {
764 		rval->rcv_next = *root;
765 		rval->rcv_next->rcv_prev = rval;
766 		*root = rval;
767 
768 		return (0);
769 	}
770 
771 	prev = *root;
772 	while (prev->rcv_next != NULL &&
773 	    (equiv = rctl_val_cmp(rval, prev->rcv_next, 0)) > 0) {
774 		prev = prev->rcv_next;
775 	}
776 
777 	if (equiv == 0)
778 		return (1);
779 
780 	rval->rcv_next = prev->rcv_next;
781 	if (rval->rcv_next != NULL)
782 		rval->rcv_next->rcv_prev = rval;
783 	prev->rcv_next = rval;
784 	rval->rcv_prev = prev;
785 
786 	return (0);
787 }
788 
789 static int
790 rctl_val_list_delete(rctl_val_t **root, rctl_val_t *rval)
791 {
792 	rctl_val_t *prev;
793 
794 	if (*root == NULL)
795 		return (-1);
796 
797 	prev = *root;
798 	if (rctl_val_cmp(rval, prev, 0) == 0) {
799 		*root = prev->rcv_next;
800 		(*root)->rcv_prev = NULL;
801 
802 		kmem_cache_free(rctl_val_cache, prev);
803 
804 		return (0);
805 	}
806 
807 	while (prev->rcv_next != NULL &&
808 	    rctl_val_cmp(rval, prev->rcv_next, 0) != 0) {
809 		prev = prev->rcv_next;
810 	}
811 
812 	if (prev->rcv_next == NULL) {
813 		/*
814 		 * If we navigate the entire list and cannot find a match, then
815 		 * return failure.
816 		 */
817 		return (-1);
818 	}
819 
820 	prev = prev->rcv_next;
821 	prev->rcv_prev->rcv_next = prev->rcv_next;
822 	if (prev->rcv_next != NULL)
823 		prev->rcv_next->rcv_prev = prev->rcv_prev;
824 
825 	kmem_cache_free(rctl_val_cache, prev);
826 
827 	return (0);
828 }
829 
830 static rctl_val_t *
831 rctl_val_list_dup(rctl_val_t *rval, rctl_alloc_gp_t *ragp, struct proc *oldp,
832     struct proc *newp)
833 {
834 	rctl_val_t *head = NULL;
835 
836 	for (; rval != NULL; rval = rval->rcv_next) {
837 		rctl_val_t *dval = rctl_gp_detach_val(ragp);
838 
839 		bcopy(rval, dval, sizeof (rctl_val_t));
840 		dval->rcv_prev = dval->rcv_next = NULL;
841 
842 		if (oldp == NULL ||
843 		    rval->rcv_action_recipient == NULL ||
844 		    rval->rcv_action_recipient == oldp) {
845 			if (rval->rcv_privilege == RCPRIV_BASIC) {
846 				dval->rcv_action_recipient = newp;
847 				dval->rcv_action_recip_pid = newp->p_pid;
848 			} else {
849 				dval->rcv_action_recipient = NULL;
850 				dval->rcv_action_recip_pid = -1;
851 			}
852 
853 			(void) rctl_val_list_insert(&head, dval);
854 		} else {
855 			kmem_cache_free(rctl_val_cache, dval);
856 		}
857 	}
858 
859 	return (head);
860 }
861 
862 static void
863 rctl_val_list_reset(rctl_val_t *rval)
864 {
865 	for (; rval != NULL; rval = rval->rcv_next)
866 		rval->rcv_firing_time = 0;
867 }
868 
869 static uint_t
870 rctl_val_list_count(rctl_val_t *rval)
871 {
872 	uint_t n = 0;
873 
874 	for (; rval != NULL; rval = rval->rcv_next)
875 		n++;
876 
877 	return (n);
878 }
879 
880 
881 static void
882 rctl_val_list_free(rctl_val_t *rval)
883 {
884 	while (rval != NULL) {
885 		rctl_val_t *next = rval->rcv_next;
886 
887 		kmem_cache_free(rctl_val_cache, rval);
888 
889 		rval = next;
890 	}
891 }
892 
893 /*
894  * rctl_qty_t rctl_model_maximum(rctl_dict_entry_t *, struct proc *)
895  *
896  * Overview
897  *   In cases where the operating system supports more than one process
898  *   addressing model, the operating system capabilities will exceed those of
899  *   one or more of these models.  Processes in a less capable model must have
900  *   their resources accurately controlled, without diluting those of their
901  *   descendants reached via exec().  rctl_model_maximum() returns the governing
902  *   value for the specified process with respect to a resource control, such
903  *   that the value can be used for the RCTLOP_SET callback or compatibility
904  *   support.
905  *
906  * Return values
907  *   The maximum value for the given process for the specified resource control.
908  *
909  * Caller's context
910  *   No restrictions on context.
911  */
912 rctl_qty_t
913 rctl_model_maximum(rctl_dict_entry_t *rde, struct proc *p)
914 {
915 	if (p->p_model == DATAMODEL_NATIVE)
916 		return (rde->rcd_max_native);
917 
918 	return (rde->rcd_max_ilp32);
919 }
920 
921 /*
922  * rctl_qty_t rctl_model_value(rctl_dict_entry_t *, struct proc *, rctl_qty_t)
923  *
924  * Overview
925  *   Convenience function wrapping the rctl_model_maximum() functionality.
926  *
927  * Return values
928  *   The lesser of the process's maximum value and the given value for the
929  *   specified resource control.
930  *
931  * Caller's context
932  *   No restrictions on context.
933  */
934 rctl_qty_t
935 rctl_model_value(rctl_dict_entry_t *rde, struct proc *p, rctl_qty_t value)
936 {
937 	rctl_qty_t max = rctl_model_maximum(rde, p);
938 
939 	return (value < max ? value : max);
940 }
941 
942 static void
943 rctl_set_insert(rctl_set_t *set, rctl_hndl_t hndl, rctl_t *rctl)
944 {
945 	uint_t index = hndl % rctl_set_size;
946 	rctl_t *next_ctl, *prev_ctl;
947 
948 	ASSERT(MUTEX_HELD(&set->rcs_lock));
949 
950 	rctl->rc_next = NULL;
951 
952 	if (set->rcs_ctls[index] == NULL) {
953 		set->rcs_ctls[index] = rctl;
954 		return;
955 	}
956 
957 	if (hndl < set->rcs_ctls[index]->rc_id) {
958 		rctl->rc_next = set->rcs_ctls[index];
959 		set->rcs_ctls[index] = rctl;
960 
961 		return;
962 	}
963 
964 	for (next_ctl = set->rcs_ctls[index]->rc_next,
965 	    prev_ctl = set->rcs_ctls[index];
966 	    next_ctl != NULL;
967 	    prev_ctl = next_ctl,
968 	    next_ctl = next_ctl->rc_next) {
969 		if (next_ctl->rc_id > hndl) {
970 			rctl->rc_next = next_ctl;
971 			prev_ctl->rc_next = rctl;
972 
973 			return;
974 		}
975 	}
976 
977 	rctl->rc_next = next_ctl;
978 	prev_ctl->rc_next = rctl;
979 }
980 
981 /*
982  * rctl_set_t *rctl_set_create()
983  *
984  * Overview
985  *   Create an empty resource control set, suitable for attaching to a
986  *   controlled entity.
987  *
988  * Return values
989  *   A pointer to the newly created set.
990  *
991  * Caller's context
992  *   Safe for KM_SLEEP allocations.
993  */
994 rctl_set_t *
995 rctl_set_create()
996 {
997 	rctl_set_t *rset = kmem_zalloc(sizeof (rctl_set_t), KM_SLEEP);
998 
999 	mutex_init(&rset->rcs_lock, NULL, MUTEX_DEFAULT, NULL);
1000 	rset->rcs_ctls = kmem_zalloc(rctl_set_size * sizeof (rctl_t *),
1001 	    KM_SLEEP);
1002 	rset->rcs_entity = -1;
1003 
1004 	return (rset);
1005 }
1006 
1007 /*
1008  * rctl_alloc_gp_t *rctl_set_init_prealloc(rctl_entity_t)
1009  *
1010  * Overview
1011  *    rctl_set_init_prealloc() examines the globally defined resource controls
1012  *    and their default values and returns a resource control allocation group
1013  *    populated with sufficient controls and values to form a representative
1014  *    resource control set for the specified entity.
1015  *
1016  * Return values
1017  *    A pointer to the newly created allocation group.
1018  *
1019  * Caller's context
1020  *    Caller must be in a context suitable for KM_SLEEP allocations.
1021  */
1022 rctl_alloc_gp_t *
1023 rctl_set_init_prealloc(rctl_entity_t entity)
1024 {
1025 	rctl_dict_entry_t *rde;
1026 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1027 
1028 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1029 
1030 	if (rctl_lists[entity] == NULL)
1031 		return (ragp);
1032 
1033 	mutex_enter(&rctl_lists_lock);
1034 
1035 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1036 		ragp->rcag_nctls++;
1037 		ragp->rcag_nvals += rctl_val_list_count(rde->rcd_default_value);
1038 	}
1039 
1040 	mutex_exit(&rctl_lists_lock);
1041 
1042 	rctl_gp_alloc(ragp);
1043 
1044 	return (ragp);
1045 }
1046 
1047 /*
1048  * rctl_set_t *rctl_set_init(rctl_entity_t)
1049  *
1050  * Overview
1051  *   rctl_set_init() populates a resource control set, initialized with the
1052  *   system infinite values on all registered controls, for attachment to a
1053  *   system entity requiring resource controls, such as a process or a task.
1054  *
1055  * Return values
1056  *   A pointer to the newly filled set.
1057  *
1058  * Caller's context
1059  *   Caller must be holding p_lock on entry so that RCTLOP_SET() functions
1060  *   may modify task and project members based on the proc structure
1061  *   they are passed.
1062  */
1063 rctl_set_t *
1064 rctl_set_init(rctl_entity_t entity, struct proc *p, rctl_entity_p_t *e,
1065     rctl_set_t *rset, rctl_alloc_gp_t *ragp)
1066 {
1067 	rctl_dict_entry_t *rde;
1068 
1069 	ASSERT(MUTEX_HELD(&p->p_lock));
1070 	ASSERT(e);
1071 	rset->rcs_entity = entity;
1072 
1073 	if (rctl_lists[entity] == NULL)
1074 		return (rset);
1075 
1076 	mutex_enter(&rctl_lists_lock);
1077 	mutex_enter(&rset->rcs_lock);
1078 
1079 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1080 		rctl_t *rctl = rctl_gp_detach_ctl(ragp);
1081 
1082 		rctl->rc_dict_entry = rde;
1083 		rctl->rc_id = rde->rcd_id;
1084 
1085 		rctl->rc_values = rctl_val_list_dup(rde->rcd_default_value,
1086 		    ragp, NULL, p);
1087 		rctl->rc_cursor = rctl->rc_values;
1088 
1089 		ASSERT(rctl->rc_cursor != NULL);
1090 
1091 		rctl_set_insert(rset, rde->rcd_id, rctl);
1092 
1093 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1094 		    rctl->rc_cursor->rcv_value));
1095 	}
1096 
1097 	mutex_exit(&rset->rcs_lock);
1098 	mutex_exit(&rctl_lists_lock);
1099 
1100 	return (rset);
1101 }
1102 
1103 static rctl_t *
1104 rctl_dup(rctl_t *rctl, rctl_alloc_gp_t *ragp, struct proc *oldp,
1105     struct proc *newp)
1106 {
1107 	rctl_t *dup = rctl_gp_detach_ctl(ragp);
1108 	rctl_val_t *dval;
1109 
1110 	dup->rc_id = rctl->rc_id;
1111 	dup->rc_dict_entry = rctl->rc_dict_entry;
1112 	dup->rc_next = NULL;
1113 	dup->rc_cursor = NULL;
1114 	dup->rc_values = rctl_val_list_dup(rctl->rc_values, ragp, oldp, newp);
1115 
1116 	for (dval = dup->rc_values;
1117 	    dval != NULL; dval = dval->rcv_next) {
1118 		if (rctl_val_cmp(rctl->rc_cursor, dval, 0) >= 0) {
1119 			dup->rc_cursor = dval;
1120 			break;
1121 		}
1122 	}
1123 
1124 	if (dup->rc_cursor == NULL)
1125 		dup->rc_cursor = dup->rc_values;
1126 
1127 	return (dup);
1128 }
1129 
1130 static void
1131 rctl_set_fill_alloc_gp(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1132 {
1133 	uint_t i;
1134 
1135 	bzero(ragp, sizeof (rctl_alloc_gp_t));
1136 
1137 	for (i = 0; i < rctl_set_size; i++) {
1138 		rctl_t *r = set->rcs_ctls[i];
1139 
1140 		while (r != NULL) {
1141 			ragp->rcag_nctls++;
1142 
1143 			ragp->rcag_nvals += rctl_val_list_count(r->rc_values);
1144 
1145 			r = r->rc_next;
1146 		}
1147 	}
1148 }
1149 
1150 /*
1151  * rctl_alloc_gp_t *rctl_set_dup_prealloc(rctl_set_t *)
1152  *
1153  * Overview
1154  *   Given a resource control set, allocate a sufficiently large allocation
1155  *   group to contain a duplicate of the set.
1156  *
1157  * Return value
1158  *   A pointer to the newly created allocation group.
1159  *
1160  * Caller's context
1161  *   Safe for KM_SLEEP allocations.
1162  */
1163 rctl_alloc_gp_t *
1164 rctl_set_dup_prealloc(rctl_set_t *set)
1165 {
1166 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1167 
1168 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1169 
1170 	mutex_enter(&set->rcs_lock);
1171 	rctl_set_fill_alloc_gp(set, ragp);
1172 	mutex_exit(&set->rcs_lock);
1173 
1174 	rctl_gp_alloc(ragp);
1175 
1176 	return (ragp);
1177 }
1178 
1179 /*
1180  * int rctl_set_dup_ready(rctl_set_t *, rctl_alloc_gp_t *)
1181  *
1182  * Overview
1183  *   Verify that the allocation group provided is large enough to allow a
1184  *   duplicate of the given resource control set to be constructed from its
1185  *   contents.
1186  *
1187  * Return values
1188  *   1 if the allocation group is sufficiently large, 0 otherwise.
1189  *
1190  * Caller's context
1191  *   rcs_lock must be held prior to entry.
1192  */
1193 int
1194 rctl_set_dup_ready(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1195 {
1196 	rctl_alloc_gp_t curr_gp;
1197 
1198 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1199 
1200 	rctl_set_fill_alloc_gp(set, &curr_gp);
1201 
1202 	if (curr_gp.rcag_nctls <= ragp->rcag_nctls &&
1203 	    curr_gp.rcag_nvals <= ragp->rcag_nvals)
1204 		return (1);
1205 
1206 	return (0);
1207 }
1208 
1209 /*
1210  * rctl_set_t *rctl_set_dup(rctl_set_t *, struct proc *, struct proc *,
1211  *   rctl_set_t *, rctl_alloc_gp_t *, int)
1212  *
1213  * Overview
1214  *   Make a duplicate of the resource control set.  The proc pointers are those
1215  *   of the owning process and of the process associated with the entity
1216  *   receiving the duplicate.
1217  *
1218  *   Duplication is a 3 stage process. Stage 1 is memory allocation for
1219  *   the duplicate set, which is taken care of by rctl_set_dup_prealloc().
1220  *   Stage 2 consists of copying all rctls and values from the old set into
1221  *   the new. Stage 3 completes the duplication by performing the appropriate
1222  *   callbacks for each rctl in the new set.
1223  *
1224  *   Stages 2 and 3 are handled by calling rctl_set_dup with the RCD_DUP and
1225  *   RCD_CALLBACK functions, respectively. The RCD_CALLBACK flag may only
1226  *   be supplied if the newp proc structure reflects the new task and
1227  *   project linkage.
1228  *
1229  * Return value
1230  *   A pointer to the duplicate set.
1231  *
1232  * Caller's context
1233  *   The rcs_lock of the set to be duplicated must be held prior to entry.
1234  */
1235 rctl_set_t *
1236 rctl_set_dup(rctl_set_t *set, struct proc *oldp, struct proc *newp,
1237     rctl_entity_p_t *e, rctl_set_t *dup, rctl_alloc_gp_t *ragp, int flag)
1238 {
1239 	uint_t i;
1240 	rctl_set_t	*iter;
1241 
1242 	ASSERT((flag & RCD_DUP) || (flag & RCD_CALLBACK));
1243 	ASSERT(e);
1244 	/*
1245 	 * When copying the old set, iterate over that. Otherwise, when
1246 	 * only callbacks have been requested, iterate over the dup set.
1247 	 */
1248 	if (flag & RCD_DUP) {
1249 		ASSERT(MUTEX_HELD(&set->rcs_lock));
1250 		iter = set;
1251 		dup->rcs_entity = set->rcs_entity;
1252 	} else {
1253 		iter = dup;
1254 	}
1255 
1256 	mutex_enter(&dup->rcs_lock);
1257 
1258 	for (i = 0; i < rctl_set_size; i++) {
1259 		rctl_t *r = iter->rcs_ctls[i];
1260 		rctl_t *d;
1261 
1262 		while (r != NULL) {
1263 			if (flag & RCD_DUP) {
1264 				d = rctl_dup(r, ragp, oldp, newp);
1265 				rctl_set_insert(dup, r->rc_id, d);
1266 			} else {
1267 				d = r;
1268 			}
1269 
1270 			if (flag & RCD_CALLBACK)
1271 				RCTLOP_SET(d, newp, e,
1272 				    rctl_model_value(d->rc_dict_entry, newp,
1273 				    d->rc_cursor->rcv_value));
1274 
1275 			r = r->rc_next;
1276 		}
1277 	}
1278 
1279 	mutex_exit(&dup->rcs_lock);
1280 
1281 	return (dup);
1282 }
1283 
1284 /*
1285  * void rctl_set_free(rctl_set_t *)
1286  *
1287  * Overview
1288  *   Delete resource control set and all attached values.
1289  *
1290  * Return values
1291  *   No value returned.
1292  *
1293  * Caller's context
1294  *   No restrictions on context.
1295  */
1296 void
1297 rctl_set_free(rctl_set_t *set)
1298 {
1299 	uint_t i;
1300 
1301 	mutex_enter(&set->rcs_lock);
1302 	for (i = 0; i < rctl_set_size; i++) {
1303 		rctl_t *r = set->rcs_ctls[i];
1304 
1305 		while (r != NULL) {
1306 			rctl_val_t *v = r->rc_values;
1307 			rctl_t *n = r->rc_next;
1308 
1309 			kmem_cache_free(rctl_cache, r);
1310 
1311 			rctl_val_list_free(v);
1312 
1313 			r = n;
1314 		}
1315 	}
1316 	mutex_exit(&set->rcs_lock);
1317 
1318 	kmem_free(set->rcs_ctls, sizeof (rctl_t *) * rctl_set_size);
1319 	kmem_free(set, sizeof (rctl_set_t));
1320 }
1321 
1322 /*
1323  * void rctl_set_reset(rctl_set_t *)
1324  *
1325  * Overview
1326  *   Resets all rctls within the set such that the lowest value becomes active.
1327  *
1328  * Return values
1329  *   No value returned.
1330  *
1331  * Caller's context
1332  *   No restrictions on context.
1333  */
1334 void
1335 rctl_set_reset(rctl_set_t *set, struct proc *p, rctl_entity_p_t *e)
1336 {
1337 	uint_t i;
1338 
1339 	ASSERT(e);
1340 
1341 	mutex_enter(&set->rcs_lock);
1342 	for (i = 0; i < rctl_set_size; i++) {
1343 		rctl_t *r = set->rcs_ctls[i];
1344 
1345 		while (r != NULL) {
1346 			r->rc_cursor = r->rc_values;
1347 			rctl_val_list_reset(r->rc_cursor);
1348 			RCTLOP_SET(r, p, e, rctl_model_value(r->rc_dict_entry,
1349 			    p, r->rc_cursor->rcv_value));
1350 
1351 			ASSERT(r->rc_cursor != NULL);
1352 
1353 			r = r->rc_next;
1354 		}
1355 	}
1356 
1357 	mutex_exit(&set->rcs_lock);
1358 }
1359 
1360 /*
1361  * void rctl_set_tearoff(rctl_set *, struct proc *)
1362  *
1363  * Overview
1364  *   Tear off any resource control values on this set with an action recipient
1365  *   equal to the specified process (as they are becoming invalid with the
1366  *   process's departure from this set as an observer).
1367  *
1368  * Return values
1369  *   No value returned.
1370  *
1371  * Caller's context
1372  *   No restrictions on context
1373  */
1374 void
1375 rctl_set_tearoff(rctl_set_t *set, struct proc *p)
1376 {
1377 	uint_t i;
1378 
1379 	mutex_enter(&set->rcs_lock);
1380 	for (i = 0; i < rctl_set_size; i++) {
1381 		rctl_t *r = set->rcs_ctls[i];
1382 
1383 		while (r != NULL) {
1384 			rctl_val_t *rval;
1385 
1386 tearoff_rewalk_list:
1387 			rval = r->rc_values;
1388 
1389 			while (rval != NULL) {
1390 				if (rval->rcv_privilege == RCPRIV_BASIC &&
1391 				    rval->rcv_action_recipient == p) {
1392 					if (r->rc_cursor == rval)
1393 						r->rc_cursor = rval->rcv_next;
1394 
1395 					(void) rctl_val_list_delete(
1396 					    &r->rc_values, rval);
1397 
1398 					goto tearoff_rewalk_list;
1399 				}
1400 
1401 				rval = rval->rcv_next;
1402 			}
1403 
1404 			ASSERT(r->rc_cursor != NULL);
1405 
1406 			r = r->rc_next;
1407 		}
1408 	}
1409 
1410 	mutex_exit(&set->rcs_lock);
1411 }
1412 
1413 static int
1414 rctl_set_find(rctl_set_t *set, rctl_hndl_t hndl, rctl_t **rctl)
1415 {
1416 	uint_t index = hndl % rctl_set_size;
1417 	rctl_t *curr_ctl;
1418 
1419 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1420 
1421 	for (curr_ctl = set->rcs_ctls[index]; curr_ctl != NULL;
1422 	    curr_ctl = curr_ctl->rc_next) {
1423 		if (curr_ctl->rc_id == hndl) {
1424 			*rctl = curr_ctl;
1425 
1426 			return (0);
1427 		}
1428 	}
1429 
1430 	return (-1);
1431 }
1432 
1433 /*
1434  * rlim64_t rctl_enforced_value(rctl_hndl_t, rctl_set_t *, struct proc *)
1435  *
1436  * Overview
1437  *   Given a process, get the next enforced value on the rctl of the specified
1438  *   handle.
1439  *
1440  * Return value
1441  *   The enforced value.
1442  *
1443  * Caller's context
1444  *   For controls on process collectives, p->p_lock must be held across the
1445  *   operation.
1446  */
1447 /*ARGSUSED*/
1448 rctl_qty_t
1449 rctl_enforced_value(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p)
1450 {
1451 	rctl_t *rctl;
1452 	rlim64_t ret;
1453 
1454 	mutex_enter(&rset->rcs_lock);
1455 
1456 	if (rctl_set_find(rset, hndl, &rctl) == -1)
1457 		panic("unknown resource control handle %d requested", hndl);
1458 	else
1459 		ret = rctl_model_value(rctl->rc_dict_entry, p,
1460 		    rctl->rc_cursor->rcv_value);
1461 
1462 	mutex_exit(&rset->rcs_lock);
1463 
1464 	return (ret);
1465 }
1466 
1467 /*
1468  * int rctl_global_get(const char *, rctl_dict_entry_t *)
1469  *
1470  * Overview
1471  *   Copy a sanitized version of the global rctl for a given resource control
1472  *   name.  (By sanitization, we mean that the unsafe data pointers have been
1473  *   zeroed.)
1474  *
1475  * Return value
1476  *   -1 if name not defined, 0 otherwise.
1477  *
1478  * Caller's context
1479  *   No restrictions on context.  rctl_dict_lock must not be held.
1480  */
1481 int
1482 rctl_global_get(const char *name, rctl_dict_entry_t *drde)
1483 {
1484 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1485 
1486 	if (rde == NULL)
1487 		return (-1);
1488 
1489 	bcopy(rde, drde, sizeof (rctl_dict_entry_t));
1490 
1491 	drde->rcd_next = NULL;
1492 	drde->rcd_ops = NULL;
1493 
1494 	return (0);
1495 }
1496 
1497 /*
1498  * int rctl_global_set(const char *, rctl_dict_entry_t *)
1499  *
1500  * Overview
1501  *   Transfer the settable fields of the named rctl to the global rctl matching
1502  *   the given resource control name.
1503  *
1504  * Return value
1505  *   -1 if name not defined, 0 otherwise.
1506  *
1507  * Caller's context
1508  *   No restrictions on context.  rctl_dict_lock must not be held.
1509  */
1510 int
1511 rctl_global_set(const char *name, rctl_dict_entry_t *drde)
1512 {
1513 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1514 
1515 	if (rde == NULL)
1516 		return (-1);
1517 
1518 	rde->rcd_flagaction = drde->rcd_flagaction;
1519 	rde->rcd_syslog_level = drde->rcd_syslog_level;
1520 	rde->rcd_strlog_flags = drde->rcd_strlog_flags;
1521 
1522 	return (0);
1523 }
1524 
1525 static int
1526 rctl_local_op(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1527     int (*cbop)(rctl_hndl_t, struct proc *p, rctl_entity_p_t *e, rctl_t *,
1528     rctl_val_t *, rctl_val_t *), struct proc *p)
1529 {
1530 	rctl_t *rctl;
1531 	rctl_set_t *rset;
1532 	rctl_entity_p_t e;
1533 	int ret = 0;
1534 	rctl_dict_entry_t *rde = rctl_dict_lookup_hndl(hndl);
1535 
1536 local_op_retry:
1537 
1538 	ASSERT(MUTEX_HELD(&p->p_lock));
1539 
1540 	rset = rctl_entity_obtain_rset(rde, p);
1541 
1542 	if (rset == NULL) {
1543 		return (-1);
1544 	}
1545 	rctl_entity_obtain_entity_p(rset->rcs_entity, p, &e);
1546 
1547 	mutex_enter(&rset->rcs_lock);
1548 
1549 	/* using rctl's hndl, get rctl from local set */
1550 	if (rctl_set_find(rset, hndl, &rctl) == -1) {
1551 		mutex_exit(&rset->rcs_lock);
1552 		return (-1);
1553 	}
1554 
1555 	ret = cbop(hndl, p, &e, rctl, oval, nval);
1556 
1557 	mutex_exit(&rset->rcs_lock);
1558 	return (ret);
1559 }
1560 
1561 /*ARGSUSED*/
1562 static int
1563 rctl_local_get_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1564     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1565 {
1566 	if (oval == NULL) {
1567 		/*
1568 		 * RCTL_FIRST
1569 		 */
1570 		bcopy(rctl->rc_values, nval, sizeof (rctl_val_t));
1571 	} else {
1572 		/*
1573 		 * RCTL_NEXT
1574 		 */
1575 		rctl_val_t *tval = rctl_val_list_find(&rctl->rc_values, oval);
1576 
1577 		if (tval == NULL)
1578 			return (ESRCH);
1579 		else if (tval->rcv_next == NULL)
1580 			return (ENOENT);
1581 		else
1582 			bcopy(tval->rcv_next, nval, sizeof (rctl_val_t));
1583 	}
1584 
1585 	return (0);
1586 }
1587 
1588 /*
1589  * int rctl_local_get(rctl_hndl_t, rctl_val_t *)
1590  *
1591  * Overview
1592  *   Get the rctl value for the given flags.
1593  *
1594  * Return values
1595  *   0 for successful get, errno otherwise.
1596  */
1597 int
1598 rctl_local_get(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1599     struct proc *p)
1600 {
1601 	return (rctl_local_op(hndl, oval, nval, rctl_local_get_cb, p));
1602 }
1603 
1604 /*ARGSUSED*/
1605 static int
1606 rctl_local_delete_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1607     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1608 {
1609 	if ((oval = rctl_val_list_find(&rctl->rc_values, nval)) == NULL)
1610 		return (ESRCH);
1611 
1612 	if (rctl->rc_cursor == oval) {
1613 		rctl->rc_cursor = oval->rcv_next;
1614 		rctl_val_list_reset(rctl->rc_cursor);
1615 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1616 		    rctl->rc_cursor->rcv_value));
1617 
1618 		ASSERT(rctl->rc_cursor != NULL);
1619 	}
1620 
1621 	(void) rctl_val_list_delete(&rctl->rc_values, oval);
1622 
1623 	return (0);
1624 }
1625 
1626 /*
1627  * int rctl_local_delete(rctl_hndl_t, rctl_val_t *)
1628  *
1629  * Overview
1630  *   Delete the rctl value for the given flags.
1631  *
1632  * Return values
1633  *   0 for successful delete, errno otherwise.
1634  */
1635 int
1636 rctl_local_delete(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1637 {
1638 	return (rctl_local_op(hndl, NULL, val, rctl_local_delete_cb, p));
1639 }
1640 
1641 /*
1642  * rctl_local_insert_cb()
1643  *
1644  * Overview
1645  *   Insert a new value into the rctl's val list. If an error occurs,
1646  *   the val list must be left in the same state as when the function
1647  *   was entered.
1648  *
1649  * Return Values
1650  *   0 for successful insert, EINVAL if the value is duplicated in the
1651  *   existing list.
1652  */
1653 /*ARGSUSED*/
1654 static int
1655 rctl_local_insert_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1656     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1657 {
1658 	/*
1659 	 * Before inserting, confirm there are no duplicates of this value
1660 	 * and flag level. If there is a duplicate, flag an error and do
1661 	 * nothing.
1662 	 */
1663 	if (rctl_val_list_insert(&rctl->rc_values, nval) != 0)
1664 		return (EINVAL);
1665 
1666 	if (rctl_val_cmp(nval, rctl->rc_cursor, 0) < 0) {
1667 		rctl->rc_cursor = nval;
1668 		rctl_val_list_reset(rctl->rc_cursor);
1669 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1670 		    rctl->rc_cursor->rcv_value));
1671 
1672 		ASSERT(rctl->rc_cursor != NULL);
1673 	}
1674 
1675 	return (0);
1676 }
1677 
1678 /*
1679  * int rctl_local_insert(rctl_hndl_t, rctl_val_t *)
1680  *
1681  * Overview
1682  *   Insert the rctl value into the appropriate rctl set for the calling
1683  *   process, given the handle.
1684  */
1685 int
1686 rctl_local_insert(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1687 {
1688 	return (rctl_local_op(hndl, NULL, val, rctl_local_insert_cb, p));
1689 }
1690 
1691 static int
1692 rctl_local_replace_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1693     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1694 {
1695 	int ret;
1696 
1697 	/*
1698 	 * rctl_local_insert_cb() does the job of flagging an error
1699 	 * for any duplicate values. So, call rctl_local_insert_cb()
1700 	 * for the new value first, then do deletion of the old value.
1701 	 * Since this is a callback function to rctl_local_op, we can
1702 	 * count on rcs_lock being held at this point. This guarantees
1703 	 * that there is at no point a visible list which contains both
1704 	 * new and old values.
1705 	 */
1706 	if (ret = rctl_local_insert_cb(hndl, p, e, rctl, NULL, nval))
1707 		return (ret);
1708 
1709 	return (rctl_local_delete_cb(hndl, p, e, rctl, NULL, oval));
1710 }
1711 
1712 /*
1713  * int rctl_local_replace(rctl_hndl_t, void *, int, uint64_t *)
1714  *
1715  * Overview
1716  *   Replace the rctl value with a new one.
1717  *
1718  * Return values
1719  *   0 for successful replace, errno otherwise.
1720  */
1721 int
1722 rctl_local_replace(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1723     struct proc *p)
1724 {
1725 	return (rctl_local_op(hndl, oval, nval, rctl_local_replace_cb, p));
1726 }
1727 
1728 /*
1729  * int rctl_rlimit_get(rctl_hndl_t, struct proc *, struct rlimit64 *)
1730  *
1731  * Overview
1732  *   To support rlimit compatibility, we need a function which takes a 64-bit
1733  *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
1734  *   This operation is only intended for legacy rlimits.
1735  */
1736 int
1737 rctl_rlimit_get(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64)
1738 {
1739 	rctl_t *rctl;
1740 	rctl_val_t *rval;
1741 	rctl_set_t *rset = p->p_rctls;
1742 	int soft_limit_seen = 0;
1743 	int test_for_deny = 1;
1744 
1745 	mutex_enter(&rset->rcs_lock);
1746 	if (rctl_set_find(rset, rc, &rctl) == -1) {
1747 		mutex_exit(&rset->rcs_lock);
1748 		return (-1);
1749 	}
1750 
1751 	rval = rctl->rc_values;
1752 
1753 	if (rctl->rc_dict_entry->rcd_flagaction & (RCTL_GLOBAL_DENY_NEVER |
1754 	    RCTL_GLOBAL_DENY_ALWAYS))
1755 		test_for_deny = 0;
1756 
1757 	/*
1758 	 * 1.  Find the first control value with the RCTL_LOCAL_DENY bit set.
1759 	 */
1760 	while (rval != NULL && rval->rcv_privilege != RCPRIV_SYSTEM) {
1761 		if (test_for_deny &&
1762 		    (rval->rcv_flagaction & RCTL_LOCAL_DENY) == 0) {
1763 			rval = rval->rcv_next;
1764 			continue;
1765 		}
1766 
1767 		/*
1768 		 * 2.  If this is an RCPRIV_BASIC value, then we've found the
1769 		 * effective soft limit and should set rlim_cur.  We should then
1770 		 * continue looking for another control value with the DENY bit
1771 		 * set.
1772 		 */
1773 		if (rval->rcv_privilege == RCPRIV_BASIC) {
1774 			if (soft_limit_seen) {
1775 				rval = rval->rcv_next;
1776 				continue;
1777 			}
1778 
1779 			if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
1780 			    rval->rcv_value < rctl_model_maximum(
1781 			    rctl->rc_dict_entry, p))
1782 				rlp64->rlim_cur = rval->rcv_value;
1783 			else
1784 				rlp64->rlim_cur = RLIM64_INFINITY;
1785 			soft_limit_seen = 1;
1786 
1787 			rval = rval->rcv_next;
1788 			continue;
1789 		}
1790 
1791 		/*
1792 		 * 3.  This is an RCPRIV_PRIVILEGED value.  If we haven't found
1793 		 * a soft limit candidate, then we've found the effective hard
1794 		 * and soft limits and should set both  If we had found a soft
1795 		 * limit, then this is only the hard limit and we need only set
1796 		 * rlim_max.
1797 		 */
1798 		if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
1799 		    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry,
1800 		    p))
1801 			rlp64->rlim_max = rval->rcv_value;
1802 		else
1803 			rlp64->rlim_max = RLIM64_INFINITY;
1804 		if (!soft_limit_seen)
1805 			rlp64->rlim_cur = rlp64->rlim_max;
1806 
1807 		mutex_exit(&rset->rcs_lock);
1808 		return (0);
1809 	}
1810 
1811 	if (rval == NULL) {
1812 		/*
1813 		 * This control sequence is corrupt, as it is not terminated by
1814 		 * a system privileged control value.
1815 		 */
1816 		mutex_exit(&rset->rcs_lock);
1817 		return (-1);
1818 	}
1819 
1820 	/*
1821 	 * 4.  If we run into a RCPRIV_SYSTEM value, then the hard limit (and
1822 	 * the soft, if we haven't a soft candidate) should be the value of the
1823 	 * system control value.
1824 	 */
1825 	if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
1826 	    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry, p))
1827 		rlp64->rlim_max = rval->rcv_value;
1828 	else
1829 		rlp64->rlim_max = RLIM64_INFINITY;
1830 
1831 	if (!soft_limit_seen)
1832 		rlp64->rlim_cur = rlp64->rlim_max;
1833 
1834 	mutex_exit(&rset->rcs_lock);
1835 	return (0);
1836 }
1837 
1838 /*
1839  * rctl_alloc_gp_t *rctl_rlimit_set_prealloc(uint_t)
1840  *
1841  * Overview
1842  *   Before making a series of calls to rctl_rlimit_set(), we must have a
1843  *   preallocated batch of resource control values, as rctl_rlimit_set() can
1844  *   potentially consume two resource control values per call.
1845  *
1846  * Return values
1847  *   A populated resource control allocation group with 2n resource control
1848  *   values.
1849  *
1850  * Caller's context
1851  *   Must be safe for KM_SLEEP allocations.
1852  */
1853 rctl_alloc_gp_t *
1854 rctl_rlimit_set_prealloc(uint_t n)
1855 {
1856 	rctl_alloc_gp_t *gp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1857 
1858 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1859 
1860 	gp->rcag_nvals = 2 * n;
1861 
1862 	rctl_gp_alloc(gp);
1863 
1864 	return (gp);
1865 }
1866 
1867 /*
1868  * int rctl_rlimit_set(rctl_hndl_t, struct proc *, struct rlimit64 *, int,
1869  *   int)
1870  *
1871  * Overview
1872  *   To support rlimit compatibility, we need a function which takes a 64-bit
1873  *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
1874  *   This operation is only intended for legacy rlimits.
1875  *
1876  *   The implementation of rctl_rlimit_set() is a bit clever, as it tries to
1877  *   minimize the number of values placed on the value sequence in various
1878  *   cases.  Furthermore, we don't allow multiple identical privilege-action
1879  *   values on the same sequence.  (That is, we don't want a sequence like
1880  *   "while (1) { rlim.rlim_cur++; setrlimit(..., rlim); }" to exhaust kernel
1881  *   memory.)  So we want to delete any values with the same privilege value and
1882  *   action.
1883  *
1884  * Return values
1885  *   0 for successful set, errno otherwise. Errno will be either EINVAL
1886  *   or EPERM, in keeping with defined errnos for ulimit() and setrlimit()
1887  *   system calls.
1888  */
1889 /*ARGSUSED*/
1890 int
1891 rctl_rlimit_set(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64,
1892     rctl_alloc_gp_t *ragp, int flagaction, int signal, const cred_t *cr)
1893 {
1894 	rctl_t *rctl;
1895 	rctl_val_t *rval, *rval_priv, *rval_basic;
1896 	rctl_set_t *rset = p->p_rctls;
1897 	rctl_qty_t max;
1898 	rctl_entity_p_t e;
1899 	struct rlimit64 cur_rl;
1900 
1901 	e.rcep_t = RCENTITY_PROCESS;
1902 	e.rcep_p.proc = p;
1903 
1904 	if (rlp64->rlim_cur > rlp64->rlim_max)
1905 		return (EINVAL);
1906 
1907 	if (rctl_rlimit_get(rc, p, &cur_rl) == -1)
1908 		return (EINVAL);
1909 
1910 	/*
1911 	 * If we are not privileged, we can only lower the hard limit.
1912 	 */
1913 	if ((rlp64->rlim_max > cur_rl.rlim_max) &&
1914 	    cur_rl.rlim_max != RLIM64_INFINITY &&
1915 	    secpolicy_resource(cr) != 0)
1916 		return (EPERM);
1917 
1918 	mutex_enter(&rset->rcs_lock);
1919 
1920 	if (rctl_set_find(rset, rc, &rctl) == -1) {
1921 		mutex_exit(&rset->rcs_lock);
1922 		return (EINVAL);
1923 	}
1924 
1925 	rval_priv = rctl_gp_detach_val(ragp);
1926 
1927 	rval = rctl->rc_values;
1928 
1929 	while (rval != NULL) {
1930 		rctl_val_t *next = rval->rcv_next;
1931 
1932 		if (rval->rcv_privilege == RCPRIV_SYSTEM)
1933 			break;
1934 
1935 		if ((rval->rcv_privilege == RCPRIV_BASIC) ||
1936 		    (rval->rcv_flagaction & ~RCTL_LOCAL_ACTION_MASK) ==
1937 		    (flagaction & ~RCTL_LOCAL_ACTION_MASK)) {
1938 			if (rctl->rc_cursor == rval) {
1939 				rctl->rc_cursor = rval->rcv_next;
1940 				rctl_val_list_reset(rctl->rc_cursor);
1941 				RCTLOP_SET(rctl, p, &e, rctl_model_value(
1942 				    rctl->rc_dict_entry, p,
1943 				    rctl->rc_cursor->rcv_value));
1944 			}
1945 			(void) rctl_val_list_delete(&rctl->rc_values, rval);
1946 		}
1947 
1948 		rval = next;
1949 	}
1950 
1951 	rval_priv->rcv_privilege = RCPRIV_PRIVILEGED;
1952 	rval_priv->rcv_flagaction = flagaction;
1953 	if (rlp64->rlim_max == RLIM64_INFINITY) {
1954 		rval_priv->rcv_flagaction |= RCTL_LOCAL_MAXIMAL;
1955 		max = rctl->rc_dict_entry->rcd_max_native;
1956 	} else {
1957 		max = rlp64->rlim_max;
1958 	}
1959 	rval_priv->rcv_value = max;
1960 	rval_priv->rcv_action_signal = signal;
1961 	rval_priv->rcv_action_recipient = NULL;
1962 	rval_priv->rcv_action_recip_pid = -1;
1963 	rval_priv->rcv_firing_time = 0;
1964 	rval_priv->rcv_prev = rval_priv->rcv_next = NULL;
1965 
1966 	(void) rctl_val_list_insert(&rctl->rc_values, rval_priv);
1967 	rctl->rc_cursor = rval_priv;
1968 	rctl_val_list_reset(rctl->rc_cursor);
1969 	RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
1970 	    rctl->rc_cursor->rcv_value));
1971 
1972 	if (rlp64->rlim_cur != RLIM64_INFINITY && rlp64->rlim_cur < max) {
1973 		rval_basic = rctl_gp_detach_val(ragp);
1974 
1975 		rval_basic->rcv_privilege = RCPRIV_BASIC;
1976 		rval_basic->rcv_value = rlp64->rlim_cur;
1977 		rval_basic->rcv_flagaction = flagaction;
1978 		rval_basic->rcv_action_signal = signal;
1979 		rval_basic->rcv_action_recipient = p;
1980 		rval_basic->rcv_action_recip_pid = p->p_pid;
1981 		rval_basic->rcv_firing_time = 0;
1982 		rval_basic->rcv_prev = rval_basic->rcv_next = NULL;
1983 
1984 		(void) rctl_val_list_insert(&rctl->rc_values, rval_basic);
1985 		rctl->rc_cursor = rval_basic;
1986 		rctl_val_list_reset(rctl->rc_cursor);
1987 		RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
1988 		    rctl->rc_cursor->rcv_value));
1989 	}
1990 
1991 	ASSERT(rctl->rc_cursor != NULL);
1992 
1993 	mutex_exit(&rset->rcs_lock);
1994 	return (0);
1995 }
1996 
1997 
1998 /*
1999  * rctl_hndl_t rctl_register(const char *, rctl_entity_t, int, rlim64_t,
2000  *   rlim64_t, rctl_ops_t *)
2001  *
2002  * Overview
2003  *   rctl_register() performs a look-up in the dictionary of rctls
2004  *   active on the system; if a rctl of that name is absent, an entry is
2005  *   made into the dictionary.  The rctl is returned with its reference
2006  *   count incremented by one.  If the rctl name already exists, we panic.
2007  *   (Were the resource control system to support dynamic loading and unloading,
2008  *   which it is structured for, duplicate registration should lead to load
2009  *   failure instead of panicking.)
2010  *
2011  *   Each registered rctl has a requirement that a RCPRIV_SYSTEM limit be
2012  *   defined.  This limit contains the highest possible value for this quantity
2013  *   on the system.  Furthermore, the registered control must provide infinite
2014  *   values for all applicable address space models supported by the operating
2015  *   system.  Attempts to set resource control values beyond the system limit
2016  *   will fail.
2017  *
2018  * Return values
2019  *   The rctl's ID.
2020  *
2021  * Caller's context
2022  *   Caller must be in a context suitable for KM_SLEEP allocations.
2023  */
2024 rctl_hndl_t
2025 rctl_register(
2026     const char *name,
2027     rctl_entity_t entity,
2028     int global_flags,
2029     rlim64_t max_native,
2030     rlim64_t max_ilp32,
2031     rctl_ops_t *ops)
2032 {
2033 	rctl_t *rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
2034 	rctl_val_t *rctl_val = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2035 	rctl_dict_entry_t *rctl_de = kmem_zalloc(sizeof (rctl_dict_entry_t),
2036 	    KM_SLEEP);
2037 	rctl_t *old_rctl;
2038 	rctl_hndl_t rhndl;
2039 	int localflags;
2040 
2041 	ASSERT(ops != NULL);
2042 
2043 	bzero(rctl, sizeof (rctl_t));
2044 	bzero(rctl_val, sizeof (rctl_val_t));
2045 
2046 	if (global_flags & RCTL_GLOBAL_DENY_NEVER)
2047 		localflags = RCTL_LOCAL_MAXIMAL;
2048 	else
2049 		localflags = RCTL_LOCAL_MAXIMAL | RCTL_LOCAL_DENY;
2050 
2051 	rctl_val->rcv_privilege = RCPRIV_SYSTEM;
2052 	rctl_val->rcv_value = max_native;
2053 	rctl_val->rcv_flagaction = localflags;
2054 	rctl_val->rcv_action_signal = 0;
2055 	rctl_val->rcv_action_recipient = NULL;
2056 	rctl_val->rcv_action_recip_pid = -1;
2057 	rctl_val->rcv_firing_time = 0;
2058 	rctl_val->rcv_next = NULL;
2059 	rctl_val->rcv_prev = NULL;
2060 
2061 	rctl_de->rcd_name = (char *)name;
2062 	rctl_de->rcd_default_value = rctl_val;
2063 	rctl_de->rcd_max_native = max_native;
2064 	rctl_de->rcd_max_ilp32 = max_ilp32;
2065 	rctl_de->rcd_entity = entity;
2066 	rctl_de->rcd_ops = ops;
2067 	rctl_de->rcd_flagaction = global_flags;
2068 
2069 	rctl->rc_dict_entry = rctl_de;
2070 	rctl->rc_values = rctl_val;
2071 
2072 	/*
2073 	 * 1.  Take global lock, validate nonexistence of name, get ID.
2074 	 */
2075 	mutex_enter(&rctl_dict_lock);
2076 
2077 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
2078 	    (mod_hash_val_t *)&rhndl) != MH_ERR_NOTFOUND)
2079 		panic("duplicate registration of rctl %s", name);
2080 
2081 	rhndl = rctl_de->rcd_id = rctl->rc_id =
2082 	    (rctl_hndl_t)id_alloc(rctl_ids);
2083 
2084 	/*
2085 	 * 2.  Insert name-entry pair in rctl_dict_by_name.
2086 	 */
2087 	if (mod_hash_insert(rctl_dict_by_name, (mod_hash_key_t)name,
2088 	    (mod_hash_val_t)rctl_de))
2089 		panic("unable to insert rctl dict entry for %s (%u)", name,
2090 		    (uint_t)rctl->rc_id);
2091 
2092 	/*
2093 	 * 3.  Insert ID-rctl_t * pair in rctl_dict.
2094 	 */
2095 	if (mod_hash_find(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2096 	    (mod_hash_val_t *)&old_rctl) != MH_ERR_NOTFOUND)
2097 		panic("duplicate rctl ID %u registered", rctl->rc_id);
2098 
2099 	if (mod_hash_insert(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2100 	    (mod_hash_val_t)rctl))
2101 		panic("unable to insert rctl %s/%u (%p)", name,
2102 		    (uint_t)rctl->rc_id, rctl);
2103 
2104 	/*
2105 	 * 3a. Insert rctl_dict_entry_t * in appropriate entity list.
2106 	 */
2107 
2108 	mutex_enter(&rctl_lists_lock);
2109 
2110 	switch (entity) {
2111 	case RCENTITY_ZONE:
2112 	case RCENTITY_PROJECT:
2113 	case RCENTITY_TASK:
2114 	case RCENTITY_PROCESS:
2115 		rctl_de->rcd_next = rctl_lists[entity];
2116 		rctl_lists[entity] = rctl_de;
2117 		break;
2118 	default:
2119 		panic("registering unknown rctl entity %d (%s)", entity,
2120 		    name);
2121 		break;
2122 	}
2123 
2124 	mutex_exit(&rctl_lists_lock);
2125 
2126 	/*
2127 	 * 4.  Drop lock.
2128 	 */
2129 	mutex_exit(&rctl_dict_lock);
2130 
2131 	return (rhndl);
2132 }
2133 
2134 /*
2135  * static int rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p,
2136  *    rctl_val_t *v)
2137  *
2138  * Overview
2139  *   rctl_global_action() takes, in according with the flags on the rctl_dict
2140  *   entry for the given control, the appropriate actions on the exceeded
2141  *   control value.  Additionally, rctl_global_action() updates the firing time
2142  *   on the exceeded value.
2143  *
2144  * Return values
2145  *   A bitmask reflecting the actions actually taken.
2146  *
2147  * Caller's context
2148  *   No restrictions on context.
2149  */
2150 /*ARGSUSED*/
2151 static int
2152 rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v)
2153 {
2154 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2155 	const char *pr, *en;
2156 	id_t id;
2157 	int ret = 0;
2158 
2159 	v->rcv_firing_time = gethrtime();
2160 
2161 	switch (v->rcv_privilege) {
2162 	case RCPRIV_BASIC:
2163 		pr = "basic";
2164 		break;
2165 	case RCPRIV_PRIVILEGED:
2166 		pr = "privileged";
2167 		break;
2168 	case RCPRIV_SYSTEM:
2169 		pr = "system";
2170 		break;
2171 	default:
2172 		pr = "unknown";
2173 		break;
2174 	}
2175 
2176 	switch (rde->rcd_entity) {
2177 	case RCENTITY_PROCESS:
2178 		en = "process";
2179 		id = p->p_pid;
2180 		break;
2181 	case RCENTITY_TASK:
2182 		en = "task";
2183 		id = p->p_task->tk_tkid;
2184 		break;
2185 	case RCENTITY_PROJECT:
2186 		en = "project";
2187 		id = p->p_task->tk_proj->kpj_id;
2188 		break;
2189 	case RCENTITY_ZONE:
2190 		en = "zone";
2191 		id = p->p_zone->zone_id;
2192 		break;
2193 	default:
2194 		en = "unknown entity associated with pid";
2195 		id = p->p_pid;
2196 		break;
2197 	}
2198 
2199 	if (rde->rcd_flagaction & RCTL_GLOBAL_SYSLOG) {
2200 		(void) strlog(0, 0, 0,
2201 		    rde->rcd_strlog_flags | log_global.lz_active,
2202 		    "%s rctl %s (value %llu) exceeded by %s %d", pr,
2203 		    rde->rcd_name, v->rcv_value, en, id);
2204 	}
2205 
2206 	if (rde->rcd_flagaction & RCTL_GLOBAL_DENY_ALWAYS)
2207 		ret |= RCT_DENY;
2208 
2209 	return (ret);
2210 }
2211 
2212 static int
2213 rctl_local_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v,
2214     uint_t safety)
2215 {
2216 	int ret = 0;
2217 	sigqueue_t *sqp = NULL;
2218 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2219 	int unobservable = (rde->rcd_flagaction & RCTL_GLOBAL_UNOBSERVABLE);
2220 
2221 	proc_t *recipient = v->rcv_action_recipient;
2222 	id_t recip_pid = v->rcv_action_recip_pid;
2223 	int recip_signal = v->rcv_action_signal;
2224 	uint_t flagaction = v->rcv_flagaction;
2225 
2226 	if (safety == RCA_UNSAFE_ALL) {
2227 		if (flagaction & RCTL_LOCAL_DENY) {
2228 			ret |= RCT_DENY;
2229 		}
2230 		return (ret);
2231 	}
2232 
2233 	if (flagaction & RCTL_LOCAL_SIGNAL) {
2234 		/*
2235 		 * We can build a siginfo only in the case that it is
2236 		 * safe for us to drop p_lock.  (For asynchronous
2237 		 * checks this is currently not true.)
2238 		 */
2239 		if (safety == RCA_SAFE) {
2240 			mutex_exit(&rset->rcs_lock);
2241 			mutex_exit(&p->p_lock);
2242 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
2243 			mutex_enter(&p->p_lock);
2244 			mutex_enter(&rset->rcs_lock);
2245 
2246 			sqp->sq_info.si_signo = recip_signal;
2247 			sqp->sq_info.si_code = SI_RCTL;
2248 			sqp->sq_info.si_errno = 0;
2249 			sqp->sq_info.si_entity = (int)rde->rcd_entity;
2250 		}
2251 
2252 		if (recipient == NULL || recipient == p) {
2253 			ret |= RCT_SIGNAL;
2254 
2255 			if (sqp == NULL) {
2256 				sigtoproc(p, NULL, recip_signal);
2257 			} else if (p == curproc) {
2258 				/*
2259 				 * Then this is a synchronous test and we can
2260 				 * direct the signal at the violating thread.
2261 				 */
2262 				sigaddqa(curproc, curthread, sqp);
2263 			} else {
2264 				sigaddqa(p, NULL, sqp);
2265 			}
2266 		} else if (!unobservable) {
2267 			proc_t *rp;
2268 
2269 			mutex_exit(&rset->rcs_lock);
2270 			mutex_exit(&p->p_lock);
2271 
2272 			mutex_enter(&pidlock);
2273 			if ((rp = prfind(recip_pid)) == recipient) {
2274 				/*
2275 				 * Recipient process is still alive, but may not
2276 				 * be in this task or project any longer.  In
2277 				 * this case, the recipient's resource control
2278 				 * set pertinent to this control will have
2279 				 * changed--and we will not deliver the signal,
2280 				 * as the recipient process is trying to tear
2281 				 * itself off of its former set.
2282 				 */
2283 				mutex_enter(&rp->p_lock);
2284 				mutex_exit(&pidlock);
2285 
2286 				if (rctl_entity_obtain_rset(rde, rp) == rset) {
2287 					ret |= RCT_SIGNAL;
2288 
2289 					if (sqp == NULL)
2290 						sigtoproc(rp, NULL,
2291 						    recip_signal);
2292 					else
2293 						sigaddqa(rp, NULL, sqp);
2294 				} else if (sqp) {
2295 					kmem_free(sqp, sizeof (sigqueue_t));
2296 				}
2297 				mutex_exit(&rp->p_lock);
2298 			} else {
2299 				mutex_exit(&pidlock);
2300 				if (sqp)
2301 					kmem_free(sqp, sizeof (sigqueue_t));
2302 			}
2303 
2304 			mutex_enter(&p->p_lock);
2305 			/*
2306 			 * Since we dropped p_lock, we may no longer be in the
2307 			 * same task or project as we were at entry.  It is thus
2308 			 * unsafe for us to reacquire the set lock at this
2309 			 * point; callers of rctl_local_action() must handle
2310 			 * this possibility.
2311 			 */
2312 			ret |= RCT_LK_ABANDONED;
2313 		} else if (sqp) {
2314 			kmem_free(sqp, sizeof (sigqueue_t));
2315 		}
2316 	}
2317 
2318 	if ((flagaction & RCTL_LOCAL_DENY) &&
2319 	    (recipient == NULL || recipient == p)) {
2320 		ret |= RCT_DENY;
2321 	}
2322 
2323 	return (ret);
2324 }
2325 
2326 /*
2327  * int rctl_action(rctl_hndl_t, rctl_set_t *, struct proc *, uint_t)
2328  *
2329  * Overview
2330  *   Take the action associated with the enforced value (as defined by
2331  *   rctl_get_enforced_value()) being exceeded or encountered.  Possibly perform
2332  *   a restricted subset of the available actions, if circumstances dictate that
2333  *   we cannot safely allocate memory (for a sigqueue_t) or guarantee process
2334  *   persistence across the duration of the function (an asynchronous action).
2335  *
2336  * Return values
2337  *   Actions taken, according to the rctl_test bitmask.
2338  *
2339  * Caller's context
2340  *   Safe to acquire rcs_lock.
2341  */
2342 int
2343 rctl_action(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p, uint_t safety)
2344 {
2345 	return (rctl_action_entity(hndl, rset, p, NULL, safety));
2346 }
2347 
2348 int
2349 rctl_action_entity(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p,
2350     rctl_entity_p_t *e, uint_t safety)
2351 {
2352 	int ret = RCT_NONE;
2353 	rctl_t *lrctl;
2354 	rctl_entity_p_t e_tmp;
2355 
2356 rctl_action_acquire:
2357 	mutex_enter(&rset->rcs_lock);
2358 	if (rctl_set_find(rset, hndl, &lrctl) == -1) {
2359 		mutex_exit(&rset->rcs_lock);
2360 		return (ret);
2361 	}
2362 
2363 	if (e == NULL) {
2364 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2365 		p, &e_tmp);
2366 		e = &e_tmp;
2367 	}
2368 
2369 	if ((ret & RCT_LK_ABANDONED) == 0) {
2370 		ret |= rctl_global_action(lrctl, rset, p, lrctl->rc_cursor);
2371 
2372 		RCTLOP_ACTION(lrctl, p, e);
2373 
2374 		ret |= rctl_local_action(lrctl, rset, p,
2375 		    lrctl->rc_cursor, safety);
2376 
2377 		if (ret & RCT_LK_ABANDONED)
2378 			goto rctl_action_acquire;
2379 	}
2380 
2381 	ret &= ~RCT_LK_ABANDONED;
2382 
2383 	if (!(ret & RCT_DENY) &&
2384 	    lrctl->rc_cursor->rcv_next != NULL) {
2385 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2386 
2387 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2388 		    p, lrctl->rc_cursor->rcv_value));
2389 
2390 	}
2391 	mutex_exit(&rset->rcs_lock);
2392 
2393 	return (ret);
2394 }
2395 
2396 /*
2397  * int rctl_test(rctl_hndl_t, rctl_set_t *, struct proc *, rctl_qty_t, uint_t)
2398  *
2399  * Overview
2400  *   Increment the resource associated with the given handle, returning zero if
2401  *   the incremented value does not exceed the threshold for the current limit
2402  *   on the resource.
2403  *
2404  * Return values
2405  *   Actions taken, according to the rctl_test bitmask.
2406  *
2407  * Caller's context
2408  *   p_lock held by caller.
2409  */
2410 /*ARGSUSED*/
2411 int
2412 rctl_test(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2413     rctl_qty_t incr, uint_t flags)
2414 {
2415 	return (rctl_test_entity(rhndl, rset, p, NULL, incr, flags));
2416 }
2417 
2418 int
2419 rctl_test_entity(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2420     rctl_entity_p_t *e, rctl_qty_t incr, uint_t flags)
2421 {
2422 	rctl_t *lrctl;
2423 	int ret = RCT_NONE;
2424 	rctl_entity_p_t e_tmp;
2425 	if (p == &p0) {
2426 		/*
2427 		 * We don't enforce rctls on the kernel itself.
2428 		 */
2429 		return (ret);
2430 	}
2431 
2432 rctl_test_acquire:
2433 	ASSERT(MUTEX_HELD(&p->p_lock));
2434 
2435 	mutex_enter(&rset->rcs_lock);
2436 
2437 	/*
2438 	 * Dereference from rctl_set.  We don't enforce newly loaded controls
2439 	 * that haven't been set on this entity (since the only valid value is
2440 	 * the infinite system value).
2441 	 */
2442 	if (rctl_set_find(rset, rhndl, &lrctl) == -1) {
2443 		mutex_exit(&rset->rcs_lock);
2444 		return (ret);
2445 	}
2446 
2447 	/*
2448 	 * This control is currently unenforced:  maximal value on control
2449 	 * supporting infinitely available resource.
2450 	 */
2451 	if ((lrctl->rc_dict_entry->rcd_flagaction & RCTL_GLOBAL_INFINITE) &&
2452 	    (lrctl->rc_cursor->rcv_flagaction & RCTL_LOCAL_MAXIMAL)) {
2453 
2454 		mutex_exit(&rset->rcs_lock);
2455 		return (ret);
2456 	}
2457 
2458 	/*
2459 	 * If we have been called by rctl_test, look up the entity pointer
2460 	 * from the proc pointer.
2461 	 */
2462 	if (e == NULL) {
2463 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2464 		p, &e_tmp);
2465 		e = &e_tmp;
2466 	}
2467 
2468 	/*
2469 	 * Get enforced rctl value and current usage.  Test the increment
2470 	 * with the current usage against the enforced value--take action as
2471 	 * necessary.
2472 	 */
2473 	while (RCTLOP_TEST(lrctl, p, e, lrctl->rc_cursor, incr, flags)) {
2474 		if ((ret & RCT_LK_ABANDONED) == 0) {
2475 			ret |= rctl_global_action(lrctl, rset, p,
2476 			    lrctl->rc_cursor);
2477 
2478 			RCTLOP_ACTION(lrctl, p, e);
2479 
2480 			ret |= rctl_local_action(lrctl, rset, p,
2481 			    lrctl->rc_cursor, flags);
2482 
2483 			if (ret & RCT_LK_ABANDONED)
2484 				goto rctl_test_acquire;
2485 		}
2486 
2487 		ret &= ~RCT_LK_ABANDONED;
2488 
2489 		if ((ret & RCT_DENY) == RCT_DENY ||
2490 		    lrctl->rc_cursor->rcv_next == NULL) {
2491 			ret |= RCT_DENY;
2492 			break;
2493 		}
2494 
2495 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2496 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2497 		    p, lrctl->rc_cursor->rcv_value));
2498 	}
2499 
2500 	mutex_exit(&rset->rcs_lock);
2501 
2502 	return (ret);
2503 }
2504 
2505 /*
2506  * void rctl_init(void)
2507  *
2508  * Overview
2509  *   Initialize the rctl subsystem, including the primoridal rctls
2510  *   provided by the system.  New subsystem-specific rctls should _not_ be
2511  *   initialized here.  (Do it in your own file.)
2512  *
2513  * Return values
2514  *   None.
2515  *
2516  * Caller's context
2517  *   Safe for KM_SLEEP allocations.  Must be called prior to any process model
2518  *   initialization.
2519  */
2520 void
2521 rctl_init(void)
2522 {
2523 	rctl_cache = kmem_cache_create("rctl_cache", sizeof (rctl_t),
2524 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2525 	rctl_val_cache = kmem_cache_create("rctl_val_cache",
2526 	    sizeof (rctl_val_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
2527 
2528 	rctl_dict = mod_hash_create_extended("rctl_dict",
2529 	    rctl_dict_size, mod_hash_null_keydtor, rctl_dict_val_dtor,
2530 	    rctl_dict_hash_by_id, NULL, rctl_dict_id_cmp, KM_SLEEP);
2531 	rctl_dict_by_name = mod_hash_create_strhash(
2532 	    "rctl_handles_by_name", rctl_dict_size,
2533 	    mod_hash_null_valdtor);
2534 	rctl_ids = id_space_create("rctl_ids", 1, max_rctl_hndl);
2535 	bzero(rctl_lists, (RC_MAX_ENTITY + 1) * sizeof (rctl_dict_entry_t *));
2536 
2537 	rctlproc_init();
2538 }
2539