xref: /illumos-gate/usr/src/uts/common/os/rctl.c (revision 98677c366f39bc9e671513615d9b1a2c6f15621d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/atomic.h>
29 #include <sys/cmn_err.h>
30 #include <sys/id_space.h>
31 #include <sys/kmem.h>
32 #include <sys/log.h>
33 #include <sys/modctl.h>
34 #include <sys/modhash.h>
35 #include <sys/mutex.h>
36 #include <sys/proc.h>
37 #include <sys/procset.h>
38 #include <sys/project.h>
39 #include <sys/resource.h>
40 #include <sys/rctl.h>
41 #include <sys/siginfo.h>
42 #include <sys/strlog.h>
43 #include <sys/systm.h>
44 #include <sys/task.h>
45 #include <sys/types.h>
46 #include <sys/policy.h>
47 #include <sys/zone.h>
48 
49 /*
50  * Resource controls (rctls)
51  *
52  *   The rctl subsystem provides a mechanism for kernel components to
53  *   register their individual resource controls with the system as a whole,
54  *   such that those controls can subscribe to specific actions while being
55  *   associated with the various process-model entities provided by the kernel:
56  *   the process, the task, the project, and the zone.  (In principle, only
57  *   minor modifications would be required to connect the resource control
58  *   functionality to non-process-model entities associated with the system.)
59  *
60  *   Subsystems register their rctls via rctl_register().  Subsystems
61  *   also wishing to provide additional limits on a given rctl can modify
62  *   them once they have the rctl handle.  Each subsystem should store the
63  *   handle to their rctl for direct access.
64  *
65  *   A primary dictionary, rctl_dict, contains a hash of id to the default
66  *   control definition for each controlled resource-entity pair on the system.
67  *   A secondary dictionary, rctl_dict_by_name, contains a hash of name to
68  *   resource control handles.  The resource control handles are distributed by
69  *   the rctl_ids ID space.  The handles are private and not to be
70  *   advertised to userland; all userland interactions are via the rctl
71  *   names.
72  *
73  *   Entities inherit their rctls from their predecessor.  Since projects have
74  *   no ancestor, they inherit their rctls from the rctl dict for project
75  *   rctls.  It is expected that project controls will be set to their
76  *   appropriate values shortly after project creation, presumably from a
77  *   policy source such as the project database.
78  *
79  * Data structures
80  *   The rctl_set_t attached to each of the process model entities is a simple
81  *   hash table keyed on the rctl handle assigned at registration.  The entries
82  *   in the hash table are rctl_t's, whose relationship with the active control
83  *   values on that resource and with the global state of the resource we
84  *   illustrate below:
85  *
86  *   rctl_dict[key] --> rctl_dict_entry
87  *			   ^
88  *			   |
89  *			+--+---+
90  *   rctl_set[key] ---> | rctl | --> value <-> value <-> system value --> NULL
91  *			+--+---+		 ^
92  *			   |			 |
93  *			   +------- cursor ------+
94  *
95  *   That is, the rctl contains a back pointer to the global resource control
96  *   state for this resource, which is also available in the rctl_dict hash
97  *   table mentioned earlier.  The rctl contains two pointers to resource
98  *   control values:  one, values, indicates the entire sequence of control
99  *   values; the other, cursor, indicates the currently active control
100  *   value--the next value to be enforced.  The value list itself is an open,
101  *   doubly-linked list, the last non-NULL member of which is the system value
102  *   for that resource (being the theoretical/conventional maximum allowable
103  *   value for the resource on this OS instance).
104  *
105  * Ops Vector
106  *   Subsystems publishing rctls need not provide instances of all of the
107  *   functions specified by the ops vector.  In particular, if general
108  *   rctl_*() entry points are not being called, certain functions can be
109  *   omitted.  These align as follows:
110  *
111  *   rctl_set()
112  *     You may wish to provide a set callback if locking circumstances prevent
113  *     it or if the performance cost of requesting the enforced value from the
114  *     resource control is prohibitively expensive.  For instance, the currently
115  *     enforced file size limit is stored on the process in the p_fsz_ctl to
116  *     maintain read()/write() performance.
117  *
118  *   rctl_test()
119  *     You must provide a test callback if you are using the rctl_test()
120  *     interface.  An action callback is optional.
121  *
122  *   rctl_action()
123  *     You may wish to provide an action callback.
124  *
125  * Registration
126  *   New resource controls can be added to a running instance by loaded modules
127  *   via registration.  (The current implementation does not support unloadable
128  *   modules; this functionality can be added if needed, via an
129  *   activation/deactivation interface involving the manipulation of the
130  *   ops vector for the resource control(s) needing to support unloading.)
131  *
132  * Control value ordering
133  *   Because the rctl_val chain on each rctl must be navigable in a
134  *   deterministic way, we have to define an ordering on the rctl_val_t's.  The
135  *   defined order is (flags & [maximal], value, flags & [deny-action],
136  *   privilege).
137  *
138  * Locking
 *   rctl_dict_lock must be acquired prior to rctl_lists_lock.  Since
 *   rctl_dict_lock or rctl_lists_lock can be acquired at the enforcement
 *   point of any subsystem, while that subsystem's own locks are held, it is
 *   at all times inappropriate to call kmem_alloc(., KM_SLEEP) while holding
 *   either of these locks.
143  *   Traversing any of the various resource control entity lists requires
144  *   holding rctl_lists_lock.
145  *
146  *   Each individual resource control set associated with an entity must have
147  *   its rcs_lock held for the duration of any operations that would add
148  *   resource controls or control values to the set.
149  *
150  *   The locking subsequence of interest is: p_lock, rctl_dict_lock,
151  *   rctl_lists_lock, entity->rcs_lock.
152  */
153 
154 id_t max_rctl_hndl = 32768;
155 int rctl_dict_size = 64;
156 int rctl_set_size = 8;
157 kmutex_t rctl_dict_lock;
158 mod_hash_t *rctl_dict;
159 mod_hash_t *rctl_dict_by_name;
160 id_space_t *rctl_ids;
161 kmem_cache_t *rctl_cache;	/* kmem cache for rctl structures */
162 kmem_cache_t *rctl_val_cache;	/* kmem cache for rctl values */
163 
164 kmutex_t rctl_lists_lock;
165 rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1];
166 
167 /*
168  * Default resource control operations and ops vector
169  *   To be used if the particular rcontrol has no specific actions defined, or
170  *   if the subsystem providing the control is quiescing (in preparation for
171  *   unloading, presumably.)
172  *
173  *   Resource controls with callbacks should fill the unused operations with the
174  *   appropriate default impotent callback.
175  */
176 /*ARGSUSED*/
177 void
178 rcop_no_action(struct rctl *r, struct proc *p, rctl_entity_p_t *e)
179 {
180 }
181 
182 /*ARGSUSED*/
183 rctl_qty_t
184 rcop_no_usage(struct rctl *r, struct proc *p)
185 {
186 	return (0);
187 }
188 
189 /*ARGSUSED*/
190 int
191 rcop_no_set(struct rctl *r, struct proc *p, rctl_entity_p_t *e, rctl_qty_t l)
192 {
193 	return (0);
194 }
195 
196 /*ARGSUSED*/
197 int
198 rcop_no_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
199     struct rctl_val *rv, rctl_qty_t i, uint_t f)
200 {
201 	return (0);
202 }
203 
204 rctl_ops_t rctl_default_ops = {
205 	rcop_no_action,
206 	rcop_no_usage,
207 	rcop_no_set,
208 	rcop_no_test
209 };
210 
211 /*
212  * Default "absolute" resource control operation and ops vector
213  *   Useful if there is no usage associated with the
214  *   resource control.
215  */
216 /*ARGSUSED*/
217 int
218 rcop_absolute_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
219     struct rctl_val *rv, rctl_qty_t i, uint_t f)
220 {
221 	return (i > rv->rcv_value);
222 }
223 
224 rctl_ops_t rctl_absolute_ops = {
225 	rcop_no_action,
226 	rcop_no_usage,
227 	rcop_no_set,
228 	rcop_absolute_test
229 };
230 
231 /*ARGSUSED*/
232 static uint_t
233 rctl_dict_hash_by_id(void *hash_data, mod_hash_key_t key)
234 {
235 	return ((uint_t)(uintptr_t)key % rctl_dict_size);
236 }
237 
238 static int
239 rctl_dict_id_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
240 {
241 	uint_t u1 = (uint_t)(uintptr_t)key1;
242 	uint_t u2 = (uint_t)(uintptr_t)key2;
243 
244 	if (u1 > u2)
245 		return (1);
246 
247 	if (u1 == u2)
248 		return (0);
249 
250 	return (-1);
251 }
252 
253 static void
254 rctl_dict_val_dtor(mod_hash_val_t val)
255 {
256 	rctl_dict_entry_t *kr = (rctl_dict_entry_t *)val;
257 
258 	kmem_free(kr, sizeof (rctl_dict_entry_t));
259 }
260 
261 /*
262  * size_t rctl_build_name_buf()
263  *
264  * Overview
265  *   rctl_build_name_buf() walks all active resource controls in the dictionary,
266  *   building a buffer of continguous NUL-terminated strings.
267  *
268  * Return values
269  *   The size of the buffer is returned, the passed pointer's contents are
270  *   modified to that of the location of the buffer.
271  *
272  * Caller's context
273  *   Caller must be in a context suitable for KM_SLEEP allocations.
274  */
275 size_t
276 rctl_build_name_buf(char **rbufp)
277 {
278 	size_t req_size, cpy_size;
279 	char *rbufloc;
280 	int i;
281 
282 rctl_rebuild_name_buf:
283 	req_size = cpy_size = 0;
284 
285 	/*
286 	 * Calculate needed buffer length.
287 	 */
288 	mutex_enter(&rctl_lists_lock);
289 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
290 		rctl_dict_entry_t *rde;
291 
292 		for (rde = rctl_lists[i];
293 		    rde != NULL;
294 		    rde = rde->rcd_next)
295 			req_size += strlen(rde->rcd_name) + 1;
296 	}
297 	mutex_exit(&rctl_lists_lock);
298 
299 	rbufloc = *rbufp = kmem_alloc(req_size, KM_SLEEP);
300 
301 	/*
302 	 * Copy rctl names into our buffer.  If the copy length exceeds the
303 	 * allocate length (due to registration changes), stop copying, free the
304 	 * buffer, and start again.
305 	 */
306 	mutex_enter(&rctl_lists_lock);
307 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
308 		rctl_dict_entry_t *rde;
309 
310 		for (rde = rctl_lists[i];
311 		    rde != NULL;
312 		    rde = rde->rcd_next) {
313 			size_t length = strlen(rde->rcd_name) + 1;
314 
315 			cpy_size += length;
316 
317 			if (cpy_size > req_size) {
318 				kmem_free(*rbufp, req_size);
319 				mutex_exit(&rctl_lists_lock);
320 				goto rctl_rebuild_name_buf;
321 			}
322 
323 			bcopy(rde->rcd_name, rbufloc, length);
324 			rbufloc += length;
325 		}
326 	}
327 	mutex_exit(&rctl_lists_lock);
328 
329 	return (req_size);
330 }
331 
332 /*
333  * rctl_dict_entry_t *rctl_dict_lookup(const char *)
334  *
335  * Overview
336  *   rctl_dict_lookup() returns the resource control dictionary entry for the
337  *   named resource control.
338  *
339  * Return values
340  *   A pointer to the appropriate resource control dictionary entry, or NULL if
341  *   no such named entry exists.
342  *
343  * Caller's context
344  *   Caller must not be holding rctl_dict_lock.
345  */
346 rctl_dict_entry_t *
347 rctl_dict_lookup(const char *name)
348 {
349 	rctl_dict_entry_t *rde;
350 
351 	mutex_enter(&rctl_dict_lock);
352 
353 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
354 	    (mod_hash_val_t *)&rde) == MH_ERR_NOTFOUND) {
355 		mutex_exit(&rctl_dict_lock);
356 		return (NULL);
357 	}
358 
359 	mutex_exit(&rctl_dict_lock);
360 
361 	return (rde);
362 }
363 
364 /*
365  * rctl_hndl_t rctl_hndl_lookup(const char *)
366  *
367  * Overview
368  *   rctl_hndl_lookup() returns the resource control id (the "handle") for the
369  *   named resource control.
370  *
371  * Return values
372  *   The appropriate id, or -1 if no such named entry exists.
373  *
374  * Caller's context
375  *   Caller must not be holding rctl_dict_lock.
376  */
377 rctl_hndl_t
378 rctl_hndl_lookup(const char *name)
379 {
380 	rctl_dict_entry_t *rde;
381 
382 	if ((rde = rctl_dict_lookup(name)) == NULL)
383 		return (-1);
384 
385 	return (rde->rcd_id);
386 }
387 
388 /*
389  * rctl_dict_entry_t * rctl_dict_lookup_hndl(rctl_hndl_t)
390  *
391  * Overview
392  *   rctl_dict_lookup_hndl() completes the public lookup functions, by returning
393  *   the resource control dictionary entry matching a given resource control id.
394  *
395  * Return values
396  *   A pointer to the matching resource control dictionary entry, or NULL if the
397  *   id does not match any existing entries.
398  *
399  * Caller's context
400  *   Caller must not be holding rctl_lists_lock.
401  */
402 rctl_dict_entry_t *
403 rctl_dict_lookup_hndl(rctl_hndl_t hndl)
404 {
405 	uint_t i;
406 
407 	mutex_enter(&rctl_lists_lock);
408 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
409 		rctl_dict_entry_t *rde;
410 
411 		for (rde = rctl_lists[i];
412 		    rde != NULL;
413 		    rde = rde->rcd_next)
414 			if (rde->rcd_id == hndl) {
415 				mutex_exit(&rctl_lists_lock);
416 				return (rde);
417 			}
418 	}
419 	mutex_exit(&rctl_lists_lock);
420 
421 	return (NULL);
422 }
423 
424 /*
425  * void rctl_add_default_limit(const char *name, rctl_qty_t value,
426  *     rctl_priv_t privilege, uint_t action)
427  *
428  * Overview
429  *   Create a default limit with specified value, privilege, and action.
430  *
431  * Return value
432  *   No value returned.
433  */
434 void
435 rctl_add_default_limit(const char *name, rctl_qty_t value,
436     rctl_priv_t privilege, uint_t action)
437 {
438 	rctl_val_t *dval;
439 	rctl_dict_entry_t *rde;
440 
441 	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
442 	bzero(dval, sizeof (rctl_val_t));
443 	dval->rcv_value = value;
444 	dval->rcv_privilege = privilege;
445 	dval->rcv_flagaction = action;
446 	dval->rcv_action_recip_pid = -1;
447 
448 	rde = rctl_dict_lookup(name);
449 	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
450 }
451 
452 /*
453  * void rctl_add_legacy_limit(const char *name, const char *mname,
454  *     const char *lname, rctl_qty_t dflt)
455  *
456  * Overview
457  *   Create a default privileged limit, using the value obtained from
458  *   /etc/system if it exists and is greater than the specified default
459  *   value.  Exists primarily for System V IPC.
460  *
461  * Return value
462  *   No value returned.
463  */
464 void
465 rctl_add_legacy_limit(const char *name, const char *mname, const char *lname,
466     rctl_qty_t dflt, rctl_qty_t max)
467 {
468 	rctl_qty_t qty;
469 
470 	if (!mod_sysvar(mname, lname, &qty) || (qty < dflt))
471 		qty = dflt;
472 
473 	if (qty > max)
474 		qty = max;
475 
476 	rctl_add_default_limit(name, qty, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
477 }
478 
479 static rctl_set_t *
480 rctl_entity_obtain_rset(rctl_dict_entry_t *rcd, struct proc *p)
481 {
482 	rctl_set_t *rset = NULL;
483 
484 	if (rcd == NULL)
485 		return (NULL);
486 
487 	switch (rcd->rcd_entity) {
488 	case RCENTITY_PROCESS:
489 		rset = p->p_rctls;
490 		break;
491 	case RCENTITY_TASK:
492 		ASSERT(MUTEX_HELD(&p->p_lock));
493 		if (p->p_task != NULL)
494 			rset = p->p_task->tk_rctls;
495 		break;
496 	case RCENTITY_PROJECT:
497 		ASSERT(MUTEX_HELD(&p->p_lock));
498 		if (p->p_task != NULL &&
499 		    p->p_task->tk_proj != NULL)
500 			rset = p->p_task->tk_proj->kpj_rctls;
501 		break;
502 	case RCENTITY_ZONE:
503 		ASSERT(MUTEX_HELD(&p->p_lock));
504 		if (p->p_zone != NULL)
505 			rset = p->p_zone->zone_rctls;
506 		break;
507 	default:
508 		panic("unknown rctl entity type %d seen", rcd->rcd_entity);
509 		break;
510 	}
511 
512 	return (rset);
513 }
514 
515 static void
516 rctl_entity_obtain_entity_p(rctl_entity_t entity, struct proc *p,
517     rctl_entity_p_t *e)
518 {
519 	e->rcep_p.proc = NULL;
520 	e->rcep_t = entity;
521 
522 	switch (entity) {
523 	case RCENTITY_PROCESS:
524 		e->rcep_p.proc = p;
525 		break;
526 	case RCENTITY_TASK:
527 		ASSERT(MUTEX_HELD(&p->p_lock));
528 		if (p->p_task != NULL)
529 			e->rcep_p.task = p->p_task;
530 		break;
531 	case RCENTITY_PROJECT:
532 		ASSERT(MUTEX_HELD(&p->p_lock));
533 		if (p->p_task != NULL &&
534 		    p->p_task->tk_proj != NULL)
535 			e->rcep_p.proj = p->p_task->tk_proj;
536 		break;
537 	case RCENTITY_ZONE:
538 		ASSERT(MUTEX_HELD(&p->p_lock));
539 		if (p->p_zone != NULL)
540 			e->rcep_p.zone = p->p_zone;
541 		break;
542 	default:
543 		panic("unknown rctl entity type %d seen", entity);
544 		break;
545 	}
546 }
547 
548 static void
549 rctl_gp_alloc(rctl_alloc_gp_t *rcgp)
550 {
551 	uint_t i;
552 
553 	if (rcgp->rcag_nctls > 0) {
554 		rctl_t *prev = kmem_cache_alloc(rctl_cache, KM_SLEEP);
555 		rctl_t *rctl = prev;
556 
557 		rcgp->rcag_ctls = prev;
558 
559 		for (i = 1; i < rcgp->rcag_nctls; i++) {
560 			rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
561 			prev->rc_next = rctl;
562 			prev = rctl;
563 		}
564 
565 		rctl->rc_next = NULL;
566 	}
567 
568 	if (rcgp->rcag_nvals > 0) {
569 		rctl_val_t *prev = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
570 		rctl_val_t *rval = prev;
571 
572 		rcgp->rcag_vals = prev;
573 
574 		for (i = 1; i < rcgp->rcag_nvals; i++) {
575 			rval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
576 			prev->rcv_next = rval;
577 			prev = rval;
578 		}
579 
580 		rval->rcv_next = NULL;
581 	}
582 
583 }
584 
585 static rctl_val_t *
586 rctl_gp_detach_val(rctl_alloc_gp_t *rcgp)
587 {
588 	rctl_val_t *rval = rcgp->rcag_vals;
589 
590 	ASSERT(rcgp->rcag_nvals > 0);
591 	rcgp->rcag_nvals--;
592 	rcgp->rcag_vals = rval->rcv_next;
593 
594 	rval->rcv_next = NULL;
595 
596 	return (rval);
597 }
598 
599 static rctl_t *
600 rctl_gp_detach_ctl(rctl_alloc_gp_t *rcgp)
601 {
602 	rctl_t *rctl = rcgp->rcag_ctls;
603 
604 	ASSERT(rcgp->rcag_nctls > 0);
605 	rcgp->rcag_nctls--;
606 	rcgp->rcag_ctls = rctl->rc_next;
607 
608 	rctl->rc_next = NULL;
609 
610 	return (rctl);
611 
612 }
613 
614 static void
615 rctl_gp_free(rctl_alloc_gp_t *rcgp)
616 {
617 	rctl_val_t *rval = rcgp->rcag_vals;
618 	rctl_t *rctl = rcgp->rcag_ctls;
619 
620 	while (rval != NULL) {
621 		rctl_val_t *next = rval->rcv_next;
622 
623 		kmem_cache_free(rctl_val_cache, rval);
624 		rval = next;
625 	}
626 
627 	while (rctl != NULL) {
628 		rctl_t *next = rctl->rc_next;
629 
630 		kmem_cache_free(rctl_cache, rctl);
631 		rctl = next;
632 	}
633 }
634 
635 /*
636  * void rctl_prealloc_destroy(rctl_alloc_gp_t *)
637  *
638  * Overview
639  *   Release all unused memory allocated via one of the "prealloc" functions:
640  *   rctl_set_init_prealloc, rctl_set_dup_prealloc, or rctl_rlimit_set_prealloc.
641  *
642  * Return values
643  *   None.
644  *
645  * Caller's context
646  *   No restrictions on context.
647  */
648 void
649 rctl_prealloc_destroy(rctl_alloc_gp_t *gp)
650 {
651 	rctl_gp_free(gp);
652 	kmem_free(gp, sizeof (rctl_alloc_gp_t));
653 }
654 
655 /*
656  * int rctl_val_cmp(rctl_val_t *, rctl_val_t *, int)
657  *
658  * Overview
659  *   This function defines an ordering to rctl_val_t's in order to allow
660  *   for correct placement in value lists. When the imprecise flag is set,
661  *   the action recipient is ignored. This is to facilitate insert,
662  *   delete, and replace operations by rctlsys.
663  *
664  * Return values
665  *   0 if the val_t's are are considered identical
666  *   -1 if a is ordered lower than b
667  *   1 if a is lowered higher than b
668  *
669  * Caller's context
670  *   No restrictions on context.
671  */
672 int
673 rctl_val_cmp(rctl_val_t *a, rctl_val_t *b, int imprecise)
674 {
675 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) <
676 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
677 		return (-1);
678 
679 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) >
680 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
681 		return (1);
682 
683 	if (a->rcv_value < b->rcv_value)
684 		return (-1);
685 
686 	if (a->rcv_value > b->rcv_value)
687 		return (1);
688 
689 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) <
690 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
691 		return (-1);
692 
693 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) >
694 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
695 		return (1);
696 
697 	if (a->rcv_privilege < b->rcv_privilege)
698 		return (-1);
699 
700 	if (a->rcv_privilege > b->rcv_privilege)
701 		return (1);
702 
703 	if (imprecise)
704 		return (0);
705 
706 	if (a->rcv_action_recip_pid < b->rcv_action_recip_pid)
707 		return (-1);
708 
709 	if (a->rcv_action_recip_pid > b->rcv_action_recip_pid)
710 		return (1);
711 
712 	return (0);
713 }
714 
715 static rctl_val_t *
716 rctl_val_list_find(rctl_val_t **head, rctl_val_t *cval)
717 {
718 	rctl_val_t *rval = *head;
719 
720 	while (rval != NULL) {
721 		if (rctl_val_cmp(cval, rval, 0) == 0)
722 			return (rval);
723 
724 		rval = rval->rcv_next;
725 	}
726 
727 	return (NULL);
728 
729 }
730 
731 /*
732  * int rctl_val_list_insert(rctl_val_t **, rctl_val_t *)
733  *
734  * Overview
735  *   This function inserts the rctl_val_t into the value list provided.
736  *   The insert is always successful unless if the value is a duplicate
737  *   of one already in the list.
738  *
739  * Return values
740  *    1 if the value was a duplicate of an existing value in the list.
741  *    0 if the insert was successful.
742  */
743 int
744 rctl_val_list_insert(rctl_val_t **root, rctl_val_t *rval)
745 {
746 	rctl_val_t *prev;
747 	int equiv;
748 
749 	rval->rcv_next = NULL;
750 	rval->rcv_prev = NULL;
751 
752 	if (*root == NULL) {
753 		*root = rval;
754 		return (0);
755 	}
756 
757 	equiv = rctl_val_cmp(rval, *root, 0);
758 
759 	if (equiv == 0)
760 		return (1);
761 
762 	if (equiv < 0) {
763 		rval->rcv_next = *root;
764 		rval->rcv_next->rcv_prev = rval;
765 		*root = rval;
766 
767 		return (0);
768 	}
769 
770 	prev = *root;
771 	while (prev->rcv_next != NULL &&
772 	    (equiv = rctl_val_cmp(rval, prev->rcv_next, 0)) > 0) {
773 		prev = prev->rcv_next;
774 	}
775 
776 	if (equiv == 0)
777 		return (1);
778 
779 	rval->rcv_next = prev->rcv_next;
780 	if (rval->rcv_next != NULL)
781 		rval->rcv_next->rcv_prev = rval;
782 	prev->rcv_next = rval;
783 	rval->rcv_prev = prev;
784 
785 	return (0);
786 }
787 
788 static int
789 rctl_val_list_delete(rctl_val_t **root, rctl_val_t *rval)
790 {
791 	rctl_val_t *prev;
792 
793 	if (*root == NULL)
794 		return (-1);
795 
796 	prev = *root;
797 	if (rctl_val_cmp(rval, prev, 0) == 0) {
798 		*root = prev->rcv_next;
799 		(*root)->rcv_prev = NULL;
800 
801 		kmem_cache_free(rctl_val_cache, prev);
802 
803 		return (0);
804 	}
805 
806 	while (prev->rcv_next != NULL &&
807 	    rctl_val_cmp(rval, prev->rcv_next, 0) != 0) {
808 		prev = prev->rcv_next;
809 	}
810 
811 	if (prev->rcv_next == NULL) {
812 		/*
813 		 * If we navigate the entire list and cannot find a match, then
814 		 * return failure.
815 		 */
816 		return (-1);
817 	}
818 
819 	prev = prev->rcv_next;
820 	prev->rcv_prev->rcv_next = prev->rcv_next;
821 	if (prev->rcv_next != NULL)
822 		prev->rcv_next->rcv_prev = prev->rcv_prev;
823 
824 	kmem_cache_free(rctl_val_cache, prev);
825 
826 	return (0);
827 }
828 
829 static rctl_val_t *
830 rctl_val_list_dup(rctl_val_t *rval, rctl_alloc_gp_t *ragp, struct proc *oldp,
831     struct proc *newp)
832 {
833 	rctl_val_t *head = NULL;
834 
835 	for (; rval != NULL; rval = rval->rcv_next) {
836 		rctl_val_t *dval = rctl_gp_detach_val(ragp);
837 
838 		bcopy(rval, dval, sizeof (rctl_val_t));
839 		dval->rcv_prev = dval->rcv_next = NULL;
840 
841 		if (oldp == NULL ||
842 		    rval->rcv_action_recipient == NULL ||
843 		    rval->rcv_action_recipient == oldp) {
844 			if (rval->rcv_privilege == RCPRIV_BASIC) {
845 				dval->rcv_action_recipient = newp;
846 				dval->rcv_action_recip_pid = newp->p_pid;
847 			} else {
848 				dval->rcv_action_recipient = NULL;
849 				dval->rcv_action_recip_pid = -1;
850 			}
851 
852 			(void) rctl_val_list_insert(&head, dval);
853 		} else {
854 			kmem_cache_free(rctl_val_cache, dval);
855 		}
856 	}
857 
858 	return (head);
859 }
860 
861 static void
862 rctl_val_list_reset(rctl_val_t *rval)
863 {
864 	for (; rval != NULL; rval = rval->rcv_next)
865 		rval->rcv_firing_time = 0;
866 }
867 
868 static uint_t
869 rctl_val_list_count(rctl_val_t *rval)
870 {
871 	uint_t n = 0;
872 
873 	for (; rval != NULL; rval = rval->rcv_next)
874 		n++;
875 
876 	return (n);
877 }
878 
879 
880 static void
881 rctl_val_list_free(rctl_val_t *rval)
882 {
883 	while (rval != NULL) {
884 		rctl_val_t *next = rval->rcv_next;
885 
886 		kmem_cache_free(rctl_val_cache, rval);
887 
888 		rval = next;
889 	}
890 }
891 
892 /*
893  * rctl_qty_t rctl_model_maximum(rctl_dict_entry_t *, struct proc *)
894  *
895  * Overview
896  *   In cases where the operating system supports more than one process
897  *   addressing model, the operating system capabilities will exceed those of
898  *   one or more of these models.  Processes in a less capable model must have
899  *   their resources accurately controlled, without diluting those of their
900  *   descendants reached via exec().  rctl_model_maximum() returns the governing
901  *   value for the specified process with respect to a resource control, such
902  *   that the value can used for the RCTLOP_SET callback or compatability
903  *   support.
904  *
905  * Return values
906  *   The maximum value for the given process for the specified resource control.
907  *
908  * Caller's context
909  *   No restrictions on context.
910  */
911 rctl_qty_t
912 rctl_model_maximum(rctl_dict_entry_t *rde, struct proc *p)
913 {
914 	if (p->p_model == DATAMODEL_NATIVE)
915 		return (rde->rcd_max_native);
916 
917 	return (rde->rcd_max_ilp32);
918 }
919 
920 /*
921  * rctl_qty_t rctl_model_value(rctl_dict_entry_t *, struct proc *, rctl_qty_t)
922  *
923  * Overview
924  *   Convenience function wrapping the rctl_model_maximum() functionality.
925  *
926  * Return values
927  *   The lesser of the process's maximum value and the given value for the
928  *   specified resource control.
929  *
930  * Caller's context
931  *   No restrictions on context.
932  */
933 rctl_qty_t
934 rctl_model_value(rctl_dict_entry_t *rde, struct proc *p, rctl_qty_t value)
935 {
936 	rctl_qty_t max = rctl_model_maximum(rde, p);
937 
938 	return (value < max ? value : max);
939 }
940 
941 static void
942 rctl_set_insert(rctl_set_t *set, rctl_hndl_t hndl, rctl_t *rctl)
943 {
944 	uint_t index = hndl % rctl_set_size;
945 	rctl_t *next_ctl, *prev_ctl;
946 
947 	ASSERT(MUTEX_HELD(&set->rcs_lock));
948 
949 	rctl->rc_next = NULL;
950 
951 	if (set->rcs_ctls[index] == NULL) {
952 		set->rcs_ctls[index] = rctl;
953 		return;
954 	}
955 
956 	if (hndl < set->rcs_ctls[index]->rc_id) {
957 		rctl->rc_next = set->rcs_ctls[index];
958 		set->rcs_ctls[index] = rctl;
959 
960 		return;
961 	}
962 
963 	for (next_ctl = set->rcs_ctls[index]->rc_next,
964 	    prev_ctl = set->rcs_ctls[index];
965 	    next_ctl != NULL;
966 	    prev_ctl = next_ctl,
967 	    next_ctl = next_ctl->rc_next) {
968 		if (next_ctl->rc_id > hndl) {
969 			rctl->rc_next = next_ctl;
970 			prev_ctl->rc_next = rctl;
971 
972 			return;
973 		}
974 	}
975 
976 	rctl->rc_next = next_ctl;
977 	prev_ctl->rc_next = rctl;
978 }
979 
980 /*
981  * rctl_set_t *rctl_set_create()
982  *
983  * Overview
984  *   Create an empty resource control set, suitable for attaching to a
985  *   controlled entity.
986  *
987  * Return values
988  *   A pointer to the newly created set.
989  *
990  * Caller's context
991  *   Safe for KM_SLEEP allocations.
992  */
993 rctl_set_t *
994 rctl_set_create()
995 {
996 	rctl_set_t *rset = kmem_zalloc(sizeof (rctl_set_t), KM_SLEEP);
997 
998 	mutex_init(&rset->rcs_lock, NULL, MUTEX_DEFAULT, NULL);
999 	rset->rcs_ctls = kmem_zalloc(rctl_set_size * sizeof (rctl_t *),
1000 	    KM_SLEEP);
1001 	rset->rcs_entity = -1;
1002 
1003 	return (rset);
1004 }
1005 
1006 /*
1007  * rctl_gp_alloc_t *rctl_set_init_prealloc(rctl_entity_t)
1008  *
1009  * Overview
1010  *    rctl_set_init_prealloc() examines the globally defined resource controls
1011  *    and their default values and returns a resource control allocation group
1012  *    populated with sufficient controls and values to form a representative
1013  *    resource control set for the specified entity.
1014  *
1015  * Return values
1016  *    A pointer to the newly created allocation group.
1017  *
1018  * Caller's context
1019  *    Caller must be in a context suitable for KM_SLEEP allocations.
1020  */
1021 rctl_alloc_gp_t *
1022 rctl_set_init_prealloc(rctl_entity_t entity)
1023 {
1024 	rctl_dict_entry_t *rde;
1025 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1026 
1027 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1028 
1029 	if (rctl_lists[entity] == NULL)
1030 		return (ragp);
1031 
1032 	mutex_enter(&rctl_lists_lock);
1033 
1034 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1035 		ragp->rcag_nctls++;
1036 		ragp->rcag_nvals += rctl_val_list_count(rde->rcd_default_value);
1037 	}
1038 
1039 	mutex_exit(&rctl_lists_lock);
1040 
1041 	rctl_gp_alloc(ragp);
1042 
1043 	return (ragp);
1044 }
1045 
1046 /*
1047  * rctl_set_t *rctl_set_init(rctl_entity_t)
1048  *
1049  * Overview
1050  *   rctl_set_create() creates a resource control set, initialized with the
1051  *   system infinite values on all registered controls, for attachment to a
1052  *   system entity requiring resource controls, such as a process or a task.
1053  *
1054  * Return values
1055  *   A pointer to the newly filled set.
1056  *
1057  * Caller's context
1058  *   Caller must be holding p_lock on entry so that RCTLOP_SET() functions
1059  *   may modify task and project members based on the proc structure
1060  *   they are passed.
1061  */
1062 rctl_set_t *
1063 rctl_set_init(rctl_entity_t entity, struct proc *p, rctl_entity_p_t *e,
1064     rctl_set_t *rset, rctl_alloc_gp_t *ragp)
1065 {
1066 	rctl_dict_entry_t *rde;
1067 
1068 	ASSERT(MUTEX_HELD(&p->p_lock));
1069 	ASSERT(e);
1070 	rset->rcs_entity = entity;
1071 
1072 	if (rctl_lists[entity] == NULL)
1073 		return (rset);
1074 
1075 	mutex_enter(&rctl_lists_lock);
1076 	mutex_enter(&rset->rcs_lock);
1077 
1078 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1079 		rctl_t *rctl = rctl_gp_detach_ctl(ragp);
1080 
1081 		rctl->rc_dict_entry = rde;
1082 		rctl->rc_id = rde->rcd_id;
1083 
1084 		rctl->rc_values = rctl_val_list_dup(rde->rcd_default_value,
1085 		    ragp, NULL, p);
1086 		rctl->rc_cursor = rctl->rc_values;
1087 
1088 		ASSERT(rctl->rc_cursor != NULL);
1089 
1090 		rctl_set_insert(rset, rde->rcd_id, rctl);
1091 
1092 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1093 		    rctl->rc_cursor->rcv_value));
1094 	}
1095 
1096 	mutex_exit(&rset->rcs_lock);
1097 	mutex_exit(&rctl_lists_lock);
1098 
1099 	return (rset);
1100 }
1101 
1102 static rctl_t *
1103 rctl_dup(rctl_t *rctl, rctl_alloc_gp_t *ragp, struct proc *oldp,
1104     struct proc *newp)
1105 {
1106 	rctl_t *dup = rctl_gp_detach_ctl(ragp);
1107 	rctl_val_t *dval;
1108 
1109 	dup->rc_id = rctl->rc_id;
1110 	dup->rc_dict_entry = rctl->rc_dict_entry;
1111 	dup->rc_next = NULL;
1112 	dup->rc_cursor = NULL;
1113 	dup->rc_values = rctl_val_list_dup(rctl->rc_values, ragp, oldp, newp);
1114 
1115 	for (dval = dup->rc_values;
1116 	    dval != NULL; dval = dval->rcv_next) {
1117 		if (rctl_val_cmp(rctl->rc_cursor, dval, 0) >= 0) {
1118 			dup->rc_cursor = dval;
1119 			break;
1120 		}
1121 	}
1122 
1123 	if (dup->rc_cursor == NULL)
1124 		dup->rc_cursor = dup->rc_values;
1125 
1126 	return (dup);
1127 }
1128 
1129 static void
1130 rctl_set_fill_alloc_gp(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1131 {
1132 	uint_t i;
1133 
1134 	bzero(ragp, sizeof (rctl_alloc_gp_t));
1135 
1136 	for (i = 0; i < rctl_set_size; i++) {
1137 		rctl_t *r = set->rcs_ctls[i];
1138 
1139 		while (r != NULL) {
1140 			ragp->rcag_nctls++;
1141 
1142 			ragp->rcag_nvals += rctl_val_list_count(r->rc_values);
1143 
1144 			r = r->rc_next;
1145 		}
1146 	}
1147 }
1148 
1149 /*
1150  * rctl_alloc_gp_t *rctl_set_dup_prealloc(rctl_set_t *)
1151  *
1152  * Overview
1153  *   Given a resource control set, allocate a sufficiently large allocation
1154  *   group to contain a duplicate of the set.
1155  *
1156  * Return value
1157  *   A pointer to the newly created allocation group.
1158  *
1159  * Caller's context
1160  *   Safe for KM_SLEEP allocations.
1161  */
1162 rctl_alloc_gp_t *
1163 rctl_set_dup_prealloc(rctl_set_t *set)
1164 {
1165 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1166 
1167 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1168 
1169 	mutex_enter(&set->rcs_lock);
1170 	rctl_set_fill_alloc_gp(set, ragp);
1171 	mutex_exit(&set->rcs_lock);
1172 
1173 	rctl_gp_alloc(ragp);
1174 
1175 	return (ragp);
1176 }
1177 
1178 /*
1179  * int rctl_set_dup_ready(rctl_set_t *, rctl_alloc_gp_t *)
1180  *
1181  * Overview
1182  *   Verify that the allocation group provided is large enough to allow a
1183  *   duplicate of the given resource control set to be constructed from its
1184  *   contents.
1185  *
1186  * Return values
1187  *   1 if the allocation group is sufficiently large, 0 otherwise.
1188  *
1189  * Caller's context
1190  *   rcs_lock must be held prior to entry.
1191  */
1192 int
1193 rctl_set_dup_ready(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1194 {
1195 	rctl_alloc_gp_t curr_gp;
1196 
1197 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1198 
1199 	rctl_set_fill_alloc_gp(set, &curr_gp);
1200 
1201 	if (curr_gp.rcag_nctls <= ragp->rcag_nctls &&
1202 	    curr_gp.rcag_nvals <= ragp->rcag_nvals)
1203 		return (1);
1204 
1205 	return (0);
1206 }
1207 
1208 /*
1209  * rctl_set_t *rctl_set_dup(rctl_set_t *, struct proc *, struct proc *,
1210  *   rctl_set_t *, rctl_alloc_gp_t *, int)
1211  *
1212  * Overview
1213  *   Make a duplicate of the resource control set.  The proc pointers are those
1214  *   of the owning process and of the process associated with the entity
1215  *   receiving the duplicate.
1216  *
1217  *   Duplication is a 3 stage process. Stage 1 is memory allocation for
1218  *   the duplicate set, which is taken care of by rctl_set_dup_prealloc().
1219  *   Stage 2 consists of copying all rctls and values from the old set into
1220  *   the new. Stage 3 completes the duplication by performing the appropriate
1221  *   callbacks for each rctl in the new set.
1222  *
1223  *   Stages 2 and 3 are handled by calling rctl_set_dup with the RCD_DUP and
1224  *   RCD_CALLBACK functions, respectively. The RCD_CALLBACK flag may only
1225  *   be supplied if the newp proc structure reflects the new task and
1226  *   project linkage.
1227  *
1228  * Return value
1229  *   A pointer to the duplicate set.
1230  *
1231  * Caller's context
1232  *   The rcs_lock of the set to be duplicated must be held prior to entry.
1233  */
1234 rctl_set_t *
1235 rctl_set_dup(rctl_set_t *set, struct proc *oldp, struct proc *newp,
1236     rctl_entity_p_t *e, rctl_set_t *dup, rctl_alloc_gp_t *ragp, int flag)
1237 {
1238 	uint_t i;
1239 	rctl_set_t	*iter;
1240 
1241 	ASSERT((flag & RCD_DUP) || (flag & RCD_CALLBACK));
1242 	ASSERT(e);
1243 	/*
1244 	 * When copying the old set, iterate over that. Otherwise, when
1245 	 * only callbacks have been requested, iterate over the dup set.
1246 	 */
1247 	if (flag & RCD_DUP) {
1248 		ASSERT(MUTEX_HELD(&set->rcs_lock));
1249 		iter = set;
1250 		dup->rcs_entity = set->rcs_entity;
1251 	} else {
1252 		iter = dup;
1253 	}
1254 
1255 	mutex_enter(&dup->rcs_lock);
1256 
1257 	for (i = 0; i < rctl_set_size; i++) {
1258 		rctl_t *r = iter->rcs_ctls[i];
1259 		rctl_t *d;
1260 
1261 		while (r != NULL) {
1262 			if (flag & RCD_DUP) {
1263 				d = rctl_dup(r, ragp, oldp, newp);
1264 				rctl_set_insert(dup, r->rc_id, d);
1265 			} else {
1266 				d = r;
1267 			}
1268 
1269 			if (flag & RCD_CALLBACK)
1270 				RCTLOP_SET(d, newp, e,
1271 				    rctl_model_value(d->rc_dict_entry, newp,
1272 				    d->rc_cursor->rcv_value));
1273 
1274 			r = r->rc_next;
1275 		}
1276 	}
1277 
1278 	mutex_exit(&dup->rcs_lock);
1279 
1280 	return (dup);
1281 }
1282 
1283 /*
1284  * void rctl_set_free(rctl_set_t *)
1285  *
1286  * Overview
1287  *   Delete resource control set and all attached values.
1288  *
1289  * Return values
1290  *   No value returned.
1291  *
1292  * Caller's context
1293  *   No restrictions on context.
1294  */
1295 void
1296 rctl_set_free(rctl_set_t *set)
1297 {
1298 	uint_t i;
1299 
1300 	mutex_enter(&set->rcs_lock);
1301 	for (i = 0; i < rctl_set_size; i++) {
1302 		rctl_t *r = set->rcs_ctls[i];
1303 
1304 		while (r != NULL) {
1305 			rctl_val_t *v = r->rc_values;
1306 			rctl_t *n = r->rc_next;
1307 
1308 			kmem_cache_free(rctl_cache, r);
1309 
1310 			rctl_val_list_free(v);
1311 
1312 			r = n;
1313 		}
1314 	}
1315 	mutex_exit(&set->rcs_lock);
1316 
1317 	kmem_free(set->rcs_ctls, sizeof (rctl_t *) * rctl_set_size);
1318 	kmem_free(set, sizeof (rctl_set_t));
1319 }
1320 
1321 /*
1322  * void rctl_set_reset(rctl_set_t *)
1323  *
1324  * Overview
1325  *   Resets all rctls within the set such that the lowest value becomes active.
1326  *
1327  * Return values
1328  *   No value returned.
1329  *
1330  * Caller's context
1331  *   No restrictions on context.
1332  */
1333 void
1334 rctl_set_reset(rctl_set_t *set, struct proc *p, rctl_entity_p_t *e)
1335 {
1336 	uint_t i;
1337 
1338 	ASSERT(e);
1339 
1340 	mutex_enter(&set->rcs_lock);
1341 	for (i = 0; i < rctl_set_size; i++) {
1342 		rctl_t *r = set->rcs_ctls[i];
1343 
1344 		while (r != NULL) {
1345 			r->rc_cursor = r->rc_values;
1346 			rctl_val_list_reset(r->rc_cursor);
1347 			RCTLOP_SET(r, p, e, rctl_model_value(r->rc_dict_entry,
1348 			    p, r->rc_cursor->rcv_value));
1349 
1350 			ASSERT(r->rc_cursor != NULL);
1351 
1352 			r = r->rc_next;
1353 		}
1354 	}
1355 
1356 	mutex_exit(&set->rcs_lock);
1357 }
1358 
1359 /*
1360  * void rctl_set_tearoff(rctl_set *, struct proc *)
1361  *
1362  * Overview
1363  *   Tear off any resource control values on this set with an action recipient
1364  *   equal to the specified process (as they are becoming invalid with the
1365  *   process's departure from this set as an observer).
1366  *
1367  * Return values
1368  *   No value returned.
1369  *
1370  * Caller's context
1371  *   No restrictions on context
1372  */
1373 void
1374 rctl_set_tearoff(rctl_set_t *set, struct proc *p)
1375 {
1376 	uint_t i;
1377 
1378 	mutex_enter(&set->rcs_lock);
1379 	for (i = 0; i < rctl_set_size; i++) {
1380 		rctl_t *r = set->rcs_ctls[i];
1381 
1382 		while (r != NULL) {
1383 			rctl_val_t *rval;
1384 
1385 tearoff_rewalk_list:
1386 			rval = r->rc_values;
1387 
1388 			while (rval != NULL) {
1389 				if (rval->rcv_privilege == RCPRIV_BASIC &&
1390 				    rval->rcv_action_recipient == p) {
1391 					if (r->rc_cursor == rval)
1392 						r->rc_cursor = rval->rcv_next;
1393 
1394 					(void) rctl_val_list_delete(
1395 					    &r->rc_values, rval);
1396 
1397 					goto tearoff_rewalk_list;
1398 				}
1399 
1400 				rval = rval->rcv_next;
1401 			}
1402 
1403 			ASSERT(r->rc_cursor != NULL);
1404 
1405 			r = r->rc_next;
1406 		}
1407 	}
1408 
1409 	mutex_exit(&set->rcs_lock);
1410 }
1411 
1412 static int
1413 rctl_set_find(rctl_set_t *set, rctl_hndl_t hndl, rctl_t **rctl)
1414 {
1415 	uint_t index = hndl % rctl_set_size;
1416 	rctl_t *curr_ctl;
1417 
1418 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1419 
1420 	for (curr_ctl = set->rcs_ctls[index]; curr_ctl != NULL;
1421 	    curr_ctl = curr_ctl->rc_next) {
1422 		if (curr_ctl->rc_id == hndl) {
1423 			*rctl = curr_ctl;
1424 
1425 			return (0);
1426 		}
1427 	}
1428 
1429 	return (-1);
1430 }
1431 
1432 /*
1433  * rlim64_t rctl_enforced_value(rctl_hndl_t, rctl_set_t *, struct proc *)
1434  *
1435  * Overview
1436  *   Given a process, get the next enforced value on the rctl of the specified
1437  *   handle.
1438  *
1439  * Return value
1440  *   The enforced value.
1441  *
1442  * Caller's context
1443  *   For controls on process collectives, p->p_lock must be held across the
1444  *   operation.
1445  */
1446 /*ARGSUSED*/
1447 rctl_qty_t
1448 rctl_enforced_value(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p)
1449 {
1450 	rctl_t *rctl;
1451 	rlim64_t ret;
1452 
1453 	mutex_enter(&rset->rcs_lock);
1454 
1455 	if (rctl_set_find(rset, hndl, &rctl) == -1)
1456 		panic("unknown resource control handle %d requested", hndl);
1457 	else
1458 		ret = rctl_model_value(rctl->rc_dict_entry, p,
1459 		    rctl->rc_cursor->rcv_value);
1460 
1461 	mutex_exit(&rset->rcs_lock);
1462 
1463 	return (ret);
1464 }
1465 
1466 /*
1467  * int rctl_global_get(const char *, rctl_dict_entry_t *)
1468  *
1469  * Overview
1470  *   Copy a sanitized version of the global rctl for a given resource control
1471  *   name.  (By sanitization, we mean that the unsafe data pointers have been
1472  *   zeroed.)
1473  *
1474  * Return value
1475  *   -1 if name not defined, 0 otherwise.
1476  *
1477  * Caller's context
1478  *   No restrictions on context.  rctl_dict_lock must not be held.
1479  */
1480 int
1481 rctl_global_get(const char *name, rctl_dict_entry_t *drde)
1482 {
1483 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1484 
1485 	if (rde == NULL)
1486 		return (-1);
1487 
1488 	bcopy(rde, drde, sizeof (rctl_dict_entry_t));
1489 
1490 	drde->rcd_next = NULL;
1491 	drde->rcd_ops = NULL;
1492 
1493 	return (0);
1494 }
1495 
1496 /*
1497  * int rctl_global_set(const char *, rctl_dict_entry_t *)
1498  *
1499  * Overview
1500  *   Transfer the settable fields of the named rctl to the global rctl matching
1501  *   the given resource control name.
1502  *
1503  * Return value
1504  *   -1 if name not defined, 0 otherwise.
1505  *
1506  * Caller's context
1507  *   No restrictions on context.  rctl_dict_lock must not be held.
1508  */
1509 int
1510 rctl_global_set(const char *name, rctl_dict_entry_t *drde)
1511 {
1512 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1513 
1514 	if (rde == NULL)
1515 		return (-1);
1516 
1517 	rde->rcd_flagaction = drde->rcd_flagaction;
1518 	rde->rcd_syslog_level = drde->rcd_syslog_level;
1519 	rde->rcd_strlog_flags = drde->rcd_strlog_flags;
1520 
1521 	return (0);
1522 }
1523 
1524 static int
1525 rctl_local_op(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1526     int (*cbop)(rctl_hndl_t, struct proc *p, rctl_entity_p_t *e, rctl_t *,
1527     rctl_val_t *, rctl_val_t *), struct proc *p)
1528 {
1529 	rctl_t *rctl;
1530 	rctl_set_t *rset;
1531 	rctl_entity_p_t e;
1532 	int ret = 0;
1533 	rctl_dict_entry_t *rde = rctl_dict_lookup_hndl(hndl);
1534 
1535 local_op_retry:
1536 
1537 	ASSERT(MUTEX_HELD(&p->p_lock));
1538 
1539 	rset = rctl_entity_obtain_rset(rde, p);
1540 
1541 	if (rset == NULL) {
1542 		return (-1);
1543 	}
1544 	rctl_entity_obtain_entity_p(rset->rcs_entity, p, &e);
1545 
1546 	mutex_enter(&rset->rcs_lock);
1547 
1548 	/* using rctl's hndl, get rctl from local set */
1549 	if (rctl_set_find(rset, hndl, &rctl) == -1) {
1550 		mutex_exit(&rset->rcs_lock);
1551 		return (-1);
1552 	}
1553 
1554 	ret = cbop(hndl, p, &e, rctl, oval, nval);
1555 
1556 	mutex_exit(&rset->rcs_lock);
1557 	return (ret);
1558 }
1559 
1560 /*ARGSUSED*/
1561 static int
1562 rctl_local_get_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1563     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1564 {
1565 	if (oval == NULL) {
1566 		/*
1567 		 * RCTL_FIRST
1568 		 */
1569 		bcopy(rctl->rc_values, nval, sizeof (rctl_val_t));
1570 	} else {
1571 		/*
1572 		 * RCTL_NEXT
1573 		 */
1574 		rctl_val_t *tval = rctl_val_list_find(&rctl->rc_values, oval);
1575 
1576 		if (tval == NULL)
1577 			return (ESRCH);
1578 		else if (tval->rcv_next == NULL)
1579 			return (ENOENT);
1580 		else
1581 			bcopy(tval->rcv_next, nval, sizeof (rctl_val_t));
1582 	}
1583 
1584 	return (0);
1585 }
1586 
/*
 * int rctl_local_get(rctl_hndl_t, rctl_val_t *, rctl_val_t *, struct proc *)
 *
 * Overview
 *   Fetch a value of the rctl identified by the handle, as it applies to the
 *   given process.  A NULL oval requests the first value; a non-NULL oval
 *   requests the value following oval.  The result is copied into nval.
 *   The work is done by rctl_local_get_cb() via rctl_local_op().
 *
 * Return values
 *   0 for successful get; ESRCH or ENOENT from the callback; -1 if
 *   rctl_local_op() could not locate the rctl.
 *
 * Caller's context
 *   p->p_lock must be held (asserted by rctl_local_op()).
 */
int
rctl_local_get(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
    struct proc *p)
{
	return (rctl_local_op(hndl, oval, nval, rctl_local_get_cb, p));
}
1602 
1603 /*ARGSUSED*/
1604 static int
1605 rctl_local_delete_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1606     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1607 {
1608 	if ((oval = rctl_val_list_find(&rctl->rc_values, nval)) == NULL)
1609 		return (ESRCH);
1610 
1611 	if (rctl->rc_cursor == oval) {
1612 		rctl->rc_cursor = oval->rcv_next;
1613 		rctl_val_list_reset(rctl->rc_cursor);
1614 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1615 		    rctl->rc_cursor->rcv_value));
1616 
1617 		ASSERT(rctl->rc_cursor != NULL);
1618 	}
1619 
1620 	(void) rctl_val_list_delete(&rctl->rc_values, oval);
1621 
1622 	return (0);
1623 }
1624 
/*
 * int rctl_local_delete(rctl_hndl_t, rctl_val_t *, struct proc *)
 *
 * Overview
 *   Delete the value matching val from the rctl identified by the handle, as
 *   it applies to the given process.  The work is done by
 *   rctl_local_delete_cb() via rctl_local_op().
 *
 * Return values
 *   0 for successful delete; ESRCH if no matching value exists; -1 if
 *   rctl_local_op() could not locate the rctl.
 *
 * Caller's context
 *   p->p_lock must be held (asserted by rctl_local_op()).
 */
int
rctl_local_delete(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
{
	return (rctl_local_op(hndl, NULL, val, rctl_local_delete_cb, p));
}
1639 
1640 /*
1641  * rctl_local_insert_cb()
1642  *
1643  * Overview
1644  *   Insert a new value into the rctl's val list. If an error occurs,
1645  *   the val list must be left in the same state as when the function
1646  *   was entered.
1647  *
1648  * Return Values
1649  *   0 for successful insert, EINVAL if the value is duplicated in the
1650  *   existing list.
1651  */
1652 /*ARGSUSED*/
1653 static int
1654 rctl_local_insert_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1655     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1656 {
1657 	/*
1658 	 * Before inserting, confirm there are no duplicates of this value
1659 	 * and flag level. If there is a duplicate, flag an error and do
1660 	 * nothing.
1661 	 */
1662 	if (rctl_val_list_insert(&rctl->rc_values, nval) != 0)
1663 		return (EINVAL);
1664 
1665 	if (rctl_val_cmp(nval, rctl->rc_cursor, 0) < 0) {
1666 		rctl->rc_cursor = nval;
1667 		rctl_val_list_reset(rctl->rc_cursor);
1668 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1669 		    rctl->rc_cursor->rcv_value));
1670 
1671 		ASSERT(rctl->rc_cursor != NULL);
1672 	}
1673 
1674 	return (0);
1675 }
1676 
/*
 * int rctl_local_insert(rctl_hndl_t, rctl_val_t *, struct proc *)
 *
 * Overview
 *   Insert the rctl value into the appropriate rctl set for the given
 *   process, given the handle.  The work is done by rctl_local_insert_cb()
 *   via rctl_local_op().
 *
 * Return values
 *   0 for successful insert; EINVAL if the value duplicates an existing one;
 *   -1 if rctl_local_op() could not locate the rctl.
 *
 * Caller's context
 *   p->p_lock must be held (asserted by rctl_local_op()).
 */
int
rctl_local_insert(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
{
	return (rctl_local_op(hndl, NULL, val, rctl_local_insert_cb, p));
}
1689 
1690 static int
1691 rctl_local_replace_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1692     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1693 {
1694 	int ret;
1695 
1696 	/*
1697 	 * rctl_local_insert_cb() does the job of flagging an error
1698 	 * for any duplicate values. So, call rctl_local_insert_cb()
1699 	 * for the new value first, then do deletion of the old value.
1700 	 * Since this is a callback function to rctl_local_op, we can
1701 	 * count on rcs_lock being held at this point. This guarantees
1702 	 * that there is at no point a visible list which contains both
1703 	 * new and old values.
1704 	 */
1705 	if (ret = rctl_local_insert_cb(hndl, p, e, rctl, NULL, nval))
1706 		return (ret);
1707 
1708 	return (rctl_local_delete_cb(hndl, p, e, rctl, NULL, oval));
1709 }
1710 
/*
 * int rctl_local_replace(rctl_hndl_t, rctl_val_t *, rctl_val_t *,
 *   struct proc *)
 *
 * Overview
 *   Replace the rctl value matching oval with nval on the rctl identified by
 *   the handle, as it applies to the given process.  The work is done by
 *   rctl_local_replace_cb() via rctl_local_op().
 *
 * Return values
 *   0 for successful replace, errno otherwise; -1 if rctl_local_op() could
 *   not locate the rctl.
 *
 * Caller's context
 *   p->p_lock must be held (asserted by rctl_local_op()).
 */
int
rctl_local_replace(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
    struct proc *p)
{
	return (rctl_local_op(hndl, oval, nval, rctl_local_replace_cb, p));
}
1726 
/*
 * int rctl_rlimit_get(rctl_hndl_t, struct proc *, struct rlimit64 *)
 *
 * Overview
 *   To support rlimit compatibility, we need a function which walks the value
 *   sequence of the given rcontrol in the process's rctl set (p->p_rctls) and
 *   decodes it into a 64-bit rlimit: rlim_cur (soft limit) and rlim_max (hard
 *   limit).  This operation is only intended for legacy rlimits.
 *
 *   A value is reported as RLIM64_INFINITY when it carries RCTL_LOCAL_MAXIMAL
 *   or is not below rctl_model_maximum() for the process.
 *
 * Return values
 *   0 on success, with both fields of *rlp64 filled in.  -1 if the handle is
 *   not present in the process's rctl set, or if the value sequence is not
 *   terminated by an RCPRIV_SYSTEM value.
 */
int
rctl_rlimit_get(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64)
{
	rctl_t *rctl;
	rctl_val_t *rval;
	rctl_set_t *rset = p->p_rctls;
	int soft_limit_seen = 0;
	int test_for_deny = 1;

	mutex_enter(&rset->rcs_lock);
	if (rctl_set_find(rset, rc, &rctl) == -1) {
		mutex_exit(&rset->rcs_lock);
		return (-1);
	}

	rval = rctl->rc_values;

	/*
	 * When the control is globally deny-never or deny-always, the local
	 * deny bit is not consulted and every value is a limit candidate.
	 */
	if (rctl->rc_dict_entry->rcd_flagaction & (RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_DENY_ALWAYS))
		test_for_deny = 0;

	/*
	 * 1.  Find the first control value with the RCTL_LOCAL_DENY bit set.
	 */
	while (rval != NULL && rval->rcv_privilege != RCPRIV_SYSTEM) {
		if (test_for_deny &&
		    (rval->rcv_flagaction & RCTL_LOCAL_DENY) == 0) {
			rval = rval->rcv_next;
			continue;
		}

		/*
		 * 2.  If this is an RCPRIV_BASIC value, then we've found the
		 * effective soft limit and should set rlim_cur.  We should then
		 * continue looking for another control value with the DENY bit
		 * set.  Only the first such basic value is used; later ones
		 * are skipped.
		 */
		if (rval->rcv_privilege == RCPRIV_BASIC) {
			if (soft_limit_seen) {
				rval = rval->rcv_next;
				continue;
			}

			if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
			    rval->rcv_value < rctl_model_maximum(
			    rctl->rc_dict_entry, p))
				rlp64->rlim_cur = rval->rcv_value;
			else
				rlp64->rlim_cur = RLIM64_INFINITY;
			soft_limit_seen = 1;

			rval = rval->rcv_next;
			continue;
		}

		/*
		 * 3.  This is an RCPRIV_PRIVILEGED value.  If we haven't found
		 * a soft limit candidate, then we've found the effective hard
		 * and soft limits and should set both.  If we had found a soft
		 * limit, then this is only the hard limit and we need only set
		 * rlim_max.
		 */
		if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
		    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry,
		    p))
			rlp64->rlim_max = rval->rcv_value;
		else
			rlp64->rlim_max = RLIM64_INFINITY;
		if (!soft_limit_seen)
			rlp64->rlim_cur = rlp64->rlim_max;

		mutex_exit(&rset->rcs_lock);
		return (0);
	}

	if (rval == NULL) {
		/*
		 * This control sequence is corrupt, as it is not terminated by
		 * a system privileged control value.
		 */
		mutex_exit(&rset->rcs_lock);
		return (-1);
	}

	/*
	 * 4.  If we run into a RCPRIV_SYSTEM value, then the hard limit (and
	 * the soft, if we haven't a soft candidate) should be the value of the
	 * system control value.
	 */
	if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
	    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry, p))
		rlp64->rlim_max = rval->rcv_value;
	else
		rlp64->rlim_max = RLIM64_INFINITY;

	if (!soft_limit_seen)
		rlp64->rlim_cur = rlp64->rlim_max;

	mutex_exit(&rset->rcs_lock);
	return (0);
}
1836 
1837 /*
1838  * rctl_alloc_gp_t *rctl_rlimit_set_prealloc(uint_t)
1839  *
1840  * Overview
1841  *   Before making a series of calls to rctl_rlimit_set(), we must have a
1842  *   preallocated batch of resource control values, as rctl_rlimit_set() can
1843  *   potentially consume two resource control values per call.
1844  *
1845  * Return values
1846  *   A populated resource control allocation group with 2n resource control
1847  *   values.
1848  *
1849  * Caller's context
1850  *   Must be safe for KM_SLEEP allocations.
1851  */
1852 rctl_alloc_gp_t *
1853 rctl_rlimit_set_prealloc(uint_t n)
1854 {
1855 	rctl_alloc_gp_t *gp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1856 
1857 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1858 
1859 	gp->rcag_nvals = 2 * n;
1860 
1861 	rctl_gp_alloc(gp);
1862 
1863 	return (gp);
1864 }
1865 
/*
 * int rctl_rlimit_set(rctl_hndl_t, struct proc *, struct rlimit64 *,
 *   rctl_alloc_gp_t *, int, int, const cred_t *)
 *
 * Overview
 *   To support rlimit compatibility, we need a function which takes a 64-bit
 *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
 *   This operation is only intended for legacy rlimits.
 *
 *   The implementation of rctl_rlimit_set() is a bit clever, as it tries to
 *   minimize the number of values placed on the value sequence in various
 *   cases.  Furthermore, we don't allow multiple identical privilege-action
 *   values on the same sequence.  (That is, we don't want a sequence like
 *   "while (1) { rlim.rlim_cur++; setrlimit(..., rlim); }" to exhaust kernel
 *   memory.)  So we want to delete any values with the same privilege value and
 *   action.
 *
 *   The hard limit becomes a new RCPRIV_PRIVILEGED value; a soft limit that
 *   is finite and below the hard limit additionally becomes a new
 *   RCPRIV_BASIC value whose action recipient is the process itself.  Both
 *   values are taken from the allocation group ragp, and both use the given
 *   local flagaction and signal.
 *
 * Return values
 *   0 for successful set, errno otherwise. Errno will be either EINVAL
 *   or EPERM, in keeping with defined errnos for ulimit() and setrlimit()
 *   system calls.
 */
/*ARGSUSED*/
int
rctl_rlimit_set(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64,
    rctl_alloc_gp_t *ragp, int flagaction, int signal, const cred_t *cr)
{
	rctl_t *rctl;
	rctl_val_t *rval, *rval_priv, *rval_basic;
	rctl_set_t *rset = p->p_rctls;
	rctl_qty_t max;
	rctl_entity_p_t e;
	struct rlimit64 cur_rl;

	e.rcep_t = RCENTITY_PROCESS;
	e.rcep_p.proc = p;

	if (rlp64->rlim_cur > rlp64->rlim_max)
		return (EINVAL);

	/*
	 * Fetch the current limits.  rctl_rlimit_get() takes and drops
	 * rcs_lock itself, so this happens before the lock is taken below.
	 */
	if (rctl_rlimit_get(rc, p, &cur_rl) == -1)
		return (EINVAL);

	/*
	 * If we are not privileged, we can only lower the hard limit.
	 */
	if ((rlp64->rlim_max > cur_rl.rlim_max) &&
	    cur_rl.rlim_max != RLIM64_INFINITY &&
	    secpolicy_resource(cr) != 0)
		return (EPERM);

	mutex_enter(&rset->rcs_lock);

	if (rctl_set_find(rset, rc, &rctl) == -1) {
		mutex_exit(&rset->rcs_lock);
		return (EINVAL);
	}

	rval_priv = rctl_gp_detach_val(ragp);

	rval = rctl->rc_values;

	/*
	 * Walk the values that precede the RCPRIV_SYSTEM value and delete
	 * every basic value, as well as every value whose flagaction bits
	 * outside RCTL_LOCAL_ACTION_MASK equal those of the new flagaction.
	 * The successor is saved first because the value may be deleted.
	 */
	while (rval != NULL) {
		rctl_val_t *next = rval->rcv_next;

		if (rval->rcv_privilege == RCPRIV_SYSTEM)
			break;

		if ((rval->rcv_privilege == RCPRIV_BASIC) ||
		    (rval->rcv_flagaction & ~RCTL_LOCAL_ACTION_MASK) ==
		    (flagaction & ~RCTL_LOCAL_ACTION_MASK)) {
			/* Move the cursor off a value about to be deleted. */
			if (rctl->rc_cursor == rval) {
				rctl->rc_cursor = rval->rcv_next;
				rctl_val_list_reset(rctl->rc_cursor);
				RCTLOP_SET(rctl, p, &e, rctl_model_value(
				    rctl->rc_dict_entry, p,
				    rctl->rc_cursor->rcv_value));
			}
			(void) rctl_val_list_delete(&rctl->rc_values, rval);
		}

		rval = next;
	}

	/*
	 * Build the privileged (hard limit) value.  An infinite hard limit is
	 * stored as the control's native maximum, flagged RCTL_LOCAL_MAXIMAL.
	 */
	rval_priv->rcv_privilege = RCPRIV_PRIVILEGED;
	rval_priv->rcv_flagaction = flagaction;
	if (rlp64->rlim_max == RLIM64_INFINITY) {
		rval_priv->rcv_flagaction |= RCTL_LOCAL_MAXIMAL;
		max = rctl->rc_dict_entry->rcd_max_native;
	} else {
		max = rlp64->rlim_max;
	}
	rval_priv->rcv_value = max;
	rval_priv->rcv_action_signal = signal;
	rval_priv->rcv_action_recipient = NULL;
	rval_priv->rcv_action_recip_pid = -1;
	rval_priv->rcv_firing_time = 0;
	rval_priv->rcv_prev = rval_priv->rcv_next = NULL;

	/*
	 * NOTE(review): the result of rctl_val_list_insert() is ignored here
	 * and below, although other callers in this file treat a non-zero
	 * return as a rejected duplicate -- confirm a rejection cannot
	 * happen after the deletions above.
	 */
	(void) rctl_val_list_insert(&rctl->rc_values, rval_priv);
	rctl->rc_cursor = rval_priv;
	rctl_val_list_reset(rctl->rc_cursor);
	RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
	    rctl->rc_cursor->rcv_value));

	/*
	 * A separate basic (soft limit) value is only needed when the soft
	 * limit is finite and strictly below the hard limit.
	 */
	if (rlp64->rlim_cur != RLIM64_INFINITY && rlp64->rlim_cur < max) {
		rval_basic = rctl_gp_detach_val(ragp);

		rval_basic->rcv_privilege = RCPRIV_BASIC;
		rval_basic->rcv_value = rlp64->rlim_cur;
		rval_basic->rcv_flagaction = flagaction;
		rval_basic->rcv_action_signal = signal;
		rval_basic->rcv_action_recipient = p;
		rval_basic->rcv_action_recip_pid = p->p_pid;
		rval_basic->rcv_firing_time = 0;
		rval_basic->rcv_prev = rval_basic->rcv_next = NULL;

		(void) rctl_val_list_insert(&rctl->rc_values, rval_basic);
		rctl->rc_cursor = rval_basic;
		rctl_val_list_reset(rctl->rc_cursor);
		RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
		    rctl->rc_cursor->rcv_value));
	}

	ASSERT(rctl->rc_cursor != NULL);

	mutex_exit(&rset->rcs_lock);
	return (0);
}
1995 
1996 
1997 /*
1998  * rctl_hndl_t rctl_register(const char *, rctl_entity_t, int, rlim64_t,
1999  *   rlim64_t, rctl_ops_t *)
2000  *
2001  * Overview
2002  *   rctl_register() performs a look-up in the dictionary of rctls
2003  *   active on the system; if a rctl of that name is absent, an entry is
2004  *   made into the dictionary.  The rctl is returned with its reference
2005  *   count incremented by one.  If the rctl name already exists, we panic.
2006  *   (Were the resource control system to support dynamic loading and unloading,
2007  *   which it is structured for, duplicate registration should lead to load
2008  *   failure instead of panicking.)
2009  *
2010  *   Each registered rctl has a requirement that a RCPRIV_SYSTEM limit be
2011  *   defined.  This limit contains the highest possible value for this quantity
2012  *   on the system.  Furthermore, the registered control must provide infinite
2013  *   values for all applicable address space models supported by the operating
2014  *   system.  Attempts to set resource control values beyond the system limit
2015  *   will fail.
2016  *
2017  * Return values
2018  *   The rctl's ID.
2019  *
2020  * Caller's context
2021  *   Caller must be in a context suitable for KM_SLEEP allocations.
2022  */
2023 rctl_hndl_t
2024 rctl_register(
2025     const char *name,
2026     rctl_entity_t entity,
2027     int global_flags,
2028     rlim64_t max_native,
2029     rlim64_t max_ilp32,
2030     rctl_ops_t *ops)
2031 {
2032 	rctl_t *rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
2033 	rctl_val_t *rctl_val = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2034 	rctl_dict_entry_t *rctl_de = kmem_zalloc(sizeof (rctl_dict_entry_t),
2035 	    KM_SLEEP);
2036 	rctl_t *old_rctl;
2037 	rctl_hndl_t rhndl;
2038 	int localflags;
2039 
2040 	ASSERT(ops != NULL);
2041 
2042 	bzero(rctl, sizeof (rctl_t));
2043 	bzero(rctl_val, sizeof (rctl_val_t));
2044 
2045 	if (global_flags & RCTL_GLOBAL_DENY_NEVER)
2046 		localflags = RCTL_LOCAL_MAXIMAL;
2047 	else
2048 		localflags = RCTL_LOCAL_MAXIMAL | RCTL_LOCAL_DENY;
2049 
2050 	rctl_val->rcv_privilege = RCPRIV_SYSTEM;
2051 	rctl_val->rcv_value = max_native;
2052 	rctl_val->rcv_flagaction = localflags;
2053 	rctl_val->rcv_action_signal = 0;
2054 	rctl_val->rcv_action_recipient = NULL;
2055 	rctl_val->rcv_action_recip_pid = -1;
2056 	rctl_val->rcv_firing_time = 0;
2057 	rctl_val->rcv_next = NULL;
2058 	rctl_val->rcv_prev = NULL;
2059 
2060 	rctl_de->rcd_name = (char *)name;
2061 	rctl_de->rcd_default_value = rctl_val;
2062 	rctl_de->rcd_max_native = max_native;
2063 	rctl_de->rcd_max_ilp32 = max_ilp32;
2064 	rctl_de->rcd_entity = entity;
2065 	rctl_de->rcd_ops = ops;
2066 	rctl_de->rcd_flagaction = global_flags;
2067 
2068 	rctl->rc_dict_entry = rctl_de;
2069 	rctl->rc_values = rctl_val;
2070 
2071 	/*
2072 	 * 1.  Take global lock, validate nonexistence of name, get ID.
2073 	 */
2074 	mutex_enter(&rctl_dict_lock);
2075 
2076 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
2077 	    (mod_hash_val_t *)&rhndl) != MH_ERR_NOTFOUND)
2078 		panic("duplicate registration of rctl %s", name);
2079 
2080 	rhndl = rctl_de->rcd_id = rctl->rc_id =
2081 	    (rctl_hndl_t)id_alloc(rctl_ids);
2082 
2083 	/*
2084 	 * 2.  Insert name-entry pair in rctl_dict_by_name.
2085 	 */
2086 	if (mod_hash_insert(rctl_dict_by_name, (mod_hash_key_t)name,
2087 	    (mod_hash_val_t)rctl_de))
2088 		panic("unable to insert rctl dict entry for %s (%u)", name,
2089 		    (uint_t)rctl->rc_id);
2090 
2091 	/*
2092 	 * 3.  Insert ID-rctl_t * pair in rctl_dict.
2093 	 */
2094 	if (mod_hash_find(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2095 	    (mod_hash_val_t *)&old_rctl) != MH_ERR_NOTFOUND)
2096 		panic("duplicate rctl ID %u registered", rctl->rc_id);
2097 
2098 	if (mod_hash_insert(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2099 	    (mod_hash_val_t)rctl))
2100 		panic("unable to insert rctl %s/%u (%p)", name,
2101 		    (uint_t)rctl->rc_id, rctl);
2102 
2103 	/*
2104 	 * 3a. Insert rctl_dict_entry_t * in appropriate entity list.
2105 	 */
2106 
2107 	mutex_enter(&rctl_lists_lock);
2108 
2109 	switch (entity) {
2110 	case RCENTITY_ZONE:
2111 	case RCENTITY_PROJECT:
2112 	case RCENTITY_TASK:
2113 	case RCENTITY_PROCESS:
2114 		rctl_de->rcd_next = rctl_lists[entity];
2115 		rctl_lists[entity] = rctl_de;
2116 		break;
2117 	default:
2118 		panic("registering unknown rctl entity %d (%s)", entity,
2119 		    name);
2120 		break;
2121 	}
2122 
2123 	mutex_exit(&rctl_lists_lock);
2124 
2125 	/*
2126 	 * 4.  Drop lock.
2127 	 */
2128 	mutex_exit(&rctl_dict_lock);
2129 
2130 	return (rhndl);
2131 }
2132 
2133 /*
2134  * static int rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p,
2135  *    rctl_val_t *v)
2136  *
2137  * Overview
2138  *   rctl_global_action() takes, in according with the flags on the rctl_dict
2139  *   entry for the given control, the appropriate actions on the exceeded
2140  *   control value.  Additionally, rctl_global_action() updates the firing time
2141  *   on the exceeded value.
2142  *
2143  * Return values
2144  *   A bitmask reflecting the actions actually taken.
2145  *
2146  * Caller's context
2147  *   No restrictions on context.
2148  */
2149 /*ARGSUSED*/
2150 static int
2151 rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v)
2152 {
2153 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2154 	const char *pr, *en, *idstr;
2155 	id_t id;
2156 	enum {
2157 		SUFFIX_NONE,	/* id consumed directly */
2158 		SUFFIX_NUMERIC,	/* id consumed in suffix */
2159 		SUFFIX_STRING	/* idstr consumed in suffix */
2160 	} suffix = SUFFIX_NONE;
2161 	int ret = 0;
2162 
2163 	v->rcv_firing_time = gethrtime();
2164 
2165 	switch (v->rcv_privilege) {
2166 	case RCPRIV_BASIC:
2167 		pr = "basic";
2168 		break;
2169 	case RCPRIV_PRIVILEGED:
2170 		pr = "privileged";
2171 		break;
2172 	case RCPRIV_SYSTEM:
2173 		pr = "system";
2174 		break;
2175 	default:
2176 		pr = "unknown";
2177 		break;
2178 	}
2179 
2180 	switch (rde->rcd_entity) {
2181 	case RCENTITY_PROCESS:
2182 		en = "process";
2183 		id = p->p_pid;
2184 		suffix = SUFFIX_NONE;
2185 		break;
2186 	case RCENTITY_TASK:
2187 		en = "task";
2188 		id = p->p_task->tk_tkid;
2189 		suffix = SUFFIX_NUMERIC;
2190 		break;
2191 	case RCENTITY_PROJECT:
2192 		en = "project";
2193 		id = p->p_task->tk_proj->kpj_id;
2194 		suffix = SUFFIX_NUMERIC;
2195 		break;
2196 	case RCENTITY_ZONE:
2197 		en = "zone";
2198 		idstr = p->p_zone->zone_name;
2199 		suffix = SUFFIX_STRING;
2200 		break;
2201 	default:
2202 		en = "unknown entity associated with process";
2203 		id = p->p_pid;
2204 		suffix = SUFFIX_NONE;
2205 		break;
2206 	}
2207 
2208 	if (rde->rcd_flagaction & RCTL_GLOBAL_SYSLOG) {
2209 		switch (suffix) {
2210 		default:
2211 		case SUFFIX_NONE:
2212 			(void) strlog(0, 0, 0,
2213 			    rde->rcd_strlog_flags | log_global.lz_active,
2214 			    "%s rctl %s (value %llu) exceeded by %s %d.",
2215 			    pr, rde->rcd_name, v->rcv_value, en, id);
2216 			break;
2217 		case SUFFIX_NUMERIC:
2218 			(void) strlog(0, 0, 0,
2219 			    rde->rcd_strlog_flags | log_global.lz_active,
2220 			    "%s rctl %s (value %llu) exceeded by process %d"
2221 			    " in %s %d.",
2222 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2223 			    en, id);
2224 			break;
2225 		case SUFFIX_STRING:
2226 			(void) strlog(0, 0, 0,
2227 			    rde->rcd_strlog_flags | log_global.lz_active,
2228 			    "%s rctl %s (value %llu) exceeded by process %d"
2229 			    " in %s %s.",
2230 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2231 			    en, idstr);
2232 			break;
2233 		}
2234 	}
2235 
2236 	if (rde->rcd_flagaction & RCTL_GLOBAL_DENY_ALWAYS)
2237 		ret |= RCT_DENY;
2238 
2239 	return (ret);
2240 }
2241 
2242 static int
2243 rctl_local_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v,
2244     uint_t safety)
2245 {
2246 	int ret = 0;
2247 	sigqueue_t *sqp = NULL;
2248 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2249 	int unobservable = (rde->rcd_flagaction & RCTL_GLOBAL_UNOBSERVABLE);
2250 
2251 	proc_t *recipient = v->rcv_action_recipient;
2252 	id_t recip_pid = v->rcv_action_recip_pid;
2253 	int recip_signal = v->rcv_action_signal;
2254 	uint_t flagaction = v->rcv_flagaction;
2255 
2256 	if (safety == RCA_UNSAFE_ALL) {
2257 		if (flagaction & RCTL_LOCAL_DENY) {
2258 			ret |= RCT_DENY;
2259 		}
2260 		return (ret);
2261 	}
2262 
2263 	if (flagaction & RCTL_LOCAL_SIGNAL) {
2264 		/*
2265 		 * We can build a siginfo only in the case that it is
2266 		 * safe for us to drop p_lock.  (For asynchronous
2267 		 * checks this is currently not true.)
2268 		 */
2269 		if (safety == RCA_SAFE) {
2270 			mutex_exit(&rset->rcs_lock);
2271 			mutex_exit(&p->p_lock);
2272 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
2273 			mutex_enter(&p->p_lock);
2274 			mutex_enter(&rset->rcs_lock);
2275 
2276 			sqp->sq_info.si_signo = recip_signal;
2277 			sqp->sq_info.si_code = SI_RCTL;
2278 			sqp->sq_info.si_errno = 0;
2279 			sqp->sq_info.si_entity = (int)rde->rcd_entity;
2280 		}
2281 
2282 		if (recipient == NULL || recipient == p) {
2283 			ret |= RCT_SIGNAL;
2284 
2285 			if (sqp == NULL) {
2286 				sigtoproc(p, NULL, recip_signal);
2287 			} else if (p == curproc) {
2288 				/*
2289 				 * Then this is a synchronous test and we can
2290 				 * direct the signal at the violating thread.
2291 				 */
2292 				sigaddqa(curproc, curthread, sqp);
2293 			} else {
2294 				sigaddqa(p, NULL, sqp);
2295 			}
2296 		} else if (!unobservable) {
2297 			proc_t *rp;
2298 
2299 			mutex_exit(&rset->rcs_lock);
2300 			mutex_exit(&p->p_lock);
2301 
2302 			mutex_enter(&pidlock);
2303 			if ((rp = prfind(recip_pid)) == recipient) {
2304 				/*
2305 				 * Recipient process is still alive, but may not
2306 				 * be in this task or project any longer.  In
2307 				 * this case, the recipient's resource control
2308 				 * set pertinent to this control will have
2309 				 * changed--and we will not deliver the signal,
2310 				 * as the recipient process is trying to tear
2311 				 * itself off of its former set.
2312 				 */
2313 				mutex_enter(&rp->p_lock);
2314 				mutex_exit(&pidlock);
2315 
2316 				if (rctl_entity_obtain_rset(rde, rp) == rset) {
2317 					ret |= RCT_SIGNAL;
2318 
2319 					if (sqp == NULL)
2320 						sigtoproc(rp, NULL,
2321 						    recip_signal);
2322 					else
2323 						sigaddqa(rp, NULL, sqp);
2324 				} else if (sqp) {
2325 					kmem_free(sqp, sizeof (sigqueue_t));
2326 				}
2327 				mutex_exit(&rp->p_lock);
2328 			} else {
2329 				mutex_exit(&pidlock);
2330 				if (sqp)
2331 					kmem_free(sqp, sizeof (sigqueue_t));
2332 			}
2333 
2334 			mutex_enter(&p->p_lock);
2335 			/*
2336 			 * Since we dropped p_lock, we may no longer be in the
2337 			 * same task or project as we were at entry.  It is thus
2338 			 * unsafe for us to reacquire the set lock at this
2339 			 * point; callers of rctl_local_action() must handle
2340 			 * this possibility.
2341 			 */
2342 			ret |= RCT_LK_ABANDONED;
2343 		} else if (sqp) {
2344 			kmem_free(sqp, sizeof (sigqueue_t));
2345 		}
2346 	}
2347 
2348 	if ((flagaction & RCTL_LOCAL_DENY) &&
2349 	    (recipient == NULL || recipient == p)) {
2350 		ret |= RCT_DENY;
2351 	}
2352 
2353 	return (ret);
2354 }
2355 
2356 /*
2357  * int rctl_action(rctl_hndl_t, rctl_set_t *, struct proc *, uint_t)
2358  *
2359  * Overview
2360  *   Take the action associated with the enforced value (as defined by
2361  *   rctl_get_enforced_value()) being exceeded or encountered.  Possibly perform
2362  *   a restricted subset of the available actions, if circumstances dictate that
2363  *   we cannot safely allocate memory (for a sigqueue_t) or guarantee process
2364  *   persistence across the duration of the function (an asynchronous action).
2365  *
2366  * Return values
2367  *   Actions taken, according to the rctl_test bitmask.
2368  *
2369  * Caller's context
2370  *   Safe to acquire rcs_lock.
2371  */
2372 int
2373 rctl_action(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p, uint_t safety)
2374 {
2375 	return (rctl_action_entity(hndl, rset, p, NULL, safety));
2376 }
2377 
2378 int
2379 rctl_action_entity(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p,
2380     rctl_entity_p_t *e, uint_t safety)
2381 {
2382 	int ret = RCT_NONE;
2383 	rctl_t *lrctl;
2384 	rctl_entity_p_t e_tmp;
2385 
2386 rctl_action_acquire:
2387 	mutex_enter(&rset->rcs_lock);
2388 	if (rctl_set_find(rset, hndl, &lrctl) == -1) {
2389 		mutex_exit(&rset->rcs_lock);
2390 		return (ret);
2391 	}
2392 
2393 	if (e == NULL) {
2394 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2395 		p, &e_tmp);
2396 		e = &e_tmp;
2397 	}
2398 
2399 	if ((ret & RCT_LK_ABANDONED) == 0) {
2400 		ret |= rctl_global_action(lrctl, rset, p, lrctl->rc_cursor);
2401 
2402 		RCTLOP_ACTION(lrctl, p, e);
2403 
2404 		ret |= rctl_local_action(lrctl, rset, p,
2405 		    lrctl->rc_cursor, safety);
2406 
2407 		if (ret & RCT_LK_ABANDONED)
2408 			goto rctl_action_acquire;
2409 	}
2410 
2411 	ret &= ~RCT_LK_ABANDONED;
2412 
2413 	if (!(ret & RCT_DENY) &&
2414 	    lrctl->rc_cursor->rcv_next != NULL) {
2415 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2416 
2417 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2418 		    p, lrctl->rc_cursor->rcv_value));
2419 
2420 	}
2421 	mutex_exit(&rset->rcs_lock);
2422 
2423 	return (ret);
2424 }
2425 
2426 /*
2427  * int rctl_test(rctl_hndl_t, rctl_set_t *, struct proc *, rctl_qty_t, uint_t)
2428  *
2429  * Overview
2430  *   Increment the resource associated with the given handle, returning zero if
2431  *   the incremented value does not exceed the threshold for the current limit
2432  *   on the resource.
2433  *
2434  * Return values
2435  *   Actions taken, according to the rctl_test bitmask.
2436  *
2437  * Caller's context
2438  *   p_lock held by caller.
2439  */
2440 /*ARGSUSED*/
2441 int
2442 rctl_test(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2443     rctl_qty_t incr, uint_t flags)
2444 {
2445 	return (rctl_test_entity(rhndl, rset, p, NULL, incr, flags));
2446 }
2447 
2448 int
2449 rctl_test_entity(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2450     rctl_entity_p_t *e, rctl_qty_t incr, uint_t flags)
2451 {
2452 	rctl_t *lrctl;
2453 	int ret = RCT_NONE;
2454 	rctl_entity_p_t e_tmp;
2455 	if (p == &p0) {
2456 		/*
2457 		 * We don't enforce rctls on the kernel itself.
2458 		 */
2459 		return (ret);
2460 	}
2461 
2462 rctl_test_acquire:
2463 	ASSERT(MUTEX_HELD(&p->p_lock));
2464 
2465 	mutex_enter(&rset->rcs_lock);
2466 
2467 	/*
2468 	 * Dereference from rctl_set.  We don't enforce newly loaded controls
2469 	 * that haven't been set on this entity (since the only valid value is
2470 	 * the infinite system value).
2471 	 */
2472 	if (rctl_set_find(rset, rhndl, &lrctl) == -1) {
2473 		mutex_exit(&rset->rcs_lock);
2474 		return (ret);
2475 	}
2476 
2477 	/*
2478 	 * This control is currently unenforced:  maximal value on control
2479 	 * supporting infinitely available resource.
2480 	 */
2481 	if ((lrctl->rc_dict_entry->rcd_flagaction & RCTL_GLOBAL_INFINITE) &&
2482 	    (lrctl->rc_cursor->rcv_flagaction & RCTL_LOCAL_MAXIMAL)) {
2483 
2484 		mutex_exit(&rset->rcs_lock);
2485 		return (ret);
2486 	}
2487 
2488 	/*
2489 	 * If we have been called by rctl_test, look up the entity pointer
2490 	 * from the proc pointer.
2491 	 */
2492 	if (e == NULL) {
2493 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2494 		p, &e_tmp);
2495 		e = &e_tmp;
2496 	}
2497 
2498 	/*
2499 	 * Get enforced rctl value and current usage.  Test the increment
2500 	 * with the current usage against the enforced value--take action as
2501 	 * necessary.
2502 	 */
2503 	while (RCTLOP_TEST(lrctl, p, e, lrctl->rc_cursor, incr, flags)) {
2504 		if ((ret & RCT_LK_ABANDONED) == 0) {
2505 			ret |= rctl_global_action(lrctl, rset, p,
2506 			    lrctl->rc_cursor);
2507 
2508 			RCTLOP_ACTION(lrctl, p, e);
2509 
2510 			ret |= rctl_local_action(lrctl, rset, p,
2511 			    lrctl->rc_cursor, flags);
2512 
2513 			if (ret & RCT_LK_ABANDONED)
2514 				goto rctl_test_acquire;
2515 		}
2516 
2517 		ret &= ~RCT_LK_ABANDONED;
2518 
2519 		if ((ret & RCT_DENY) == RCT_DENY ||
2520 		    lrctl->rc_cursor->rcv_next == NULL) {
2521 			ret |= RCT_DENY;
2522 			break;
2523 		}
2524 
2525 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2526 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2527 		    p, lrctl->rc_cursor->rcv_value));
2528 	}
2529 
2530 	mutex_exit(&rset->rcs_lock);
2531 
2532 	return (ret);
2533 }
2534 
2535 /*
2536  * void rctl_init(void)
2537  *
2538  * Overview
2539  *   Initialize the rctl subsystem, including the primoridal rctls
2540  *   provided by the system.  New subsystem-specific rctls should _not_ be
2541  *   initialized here.  (Do it in your own file.)
2542  *
2543  * Return values
2544  *   None.
2545  *
2546  * Caller's context
2547  *   Safe for KM_SLEEP allocations.  Must be called prior to any process model
2548  *   initialization.
2549  */
2550 void
2551 rctl_init(void)
2552 {
2553 	rctl_cache = kmem_cache_create("rctl_cache", sizeof (rctl_t),
2554 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2555 	rctl_val_cache = kmem_cache_create("rctl_val_cache",
2556 	    sizeof (rctl_val_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
2557 
2558 	rctl_dict = mod_hash_create_extended("rctl_dict",
2559 	    rctl_dict_size, mod_hash_null_keydtor, rctl_dict_val_dtor,
2560 	    rctl_dict_hash_by_id, NULL, rctl_dict_id_cmp, KM_SLEEP);
2561 	rctl_dict_by_name = mod_hash_create_strhash(
2562 	    "rctl_handles_by_name", rctl_dict_size,
2563 	    mod_hash_null_valdtor);
2564 	rctl_ids = id_space_create("rctl_ids", 1, max_rctl_hndl);
2565 	bzero(rctl_lists, (RC_MAX_ENTITY + 1) * sizeof (rctl_dict_entry_t *));
2566 
2567 	rctlproc_init();
2568 }
2569