xref: /titanic_44/usr/src/uts/common/os/rctl.c (revision 9584cebb1c69707f4c67306b661c2ed47d8676f1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/atomic.h>
26 #include <sys/cmn_err.h>
27 #include <sys/id_space.h>
28 #include <sys/kmem.h>
29 #include <sys/kstat.h>
30 #include <sys/log.h>
31 #include <sys/modctl.h>
32 #include <sys/modhash.h>
33 #include <sys/mutex.h>
34 #include <sys/proc.h>
35 #include <sys/procset.h>
36 #include <sys/project.h>
37 #include <sys/resource.h>
38 #include <sys/rctl.h>
39 #include <sys/siginfo.h>
40 #include <sys/strlog.h>
41 #include <sys/systm.h>
42 #include <sys/task.h>
43 #include <sys/types.h>
44 #include <sys/policy.h>
45 #include <sys/zone.h>
46 
47 /*
48  * Resource controls (rctls)
49  *
50  *   The rctl subsystem provides a mechanism for kernel components to
51  *   register their individual resource controls with the system as a whole,
52  *   such that those controls can subscribe to specific actions while being
53  *   associated with the various process-model entities provided by the kernel:
54  *   the process, the task, the project, and the zone.  (In principle, only
55  *   minor modifications would be required to connect the resource control
56  *   functionality to non-process-model entities associated with the system.)
57  *
58  *   Subsystems register their rctls via rctl_register().  Subsystems
59  *   also wishing to provide additional limits on a given rctl can modify
60  *   them once they have the rctl handle.  Each subsystem should store the
61  *   handle to their rctl for direct access.
62  *
63  *   A primary dictionary, rctl_dict, contains a hash of id to the default
64  *   control definition for each controlled resource-entity pair on the system.
65  *   A secondary dictionary, rctl_dict_by_name, contains a hash of name to
66  *   resource control handles.  The resource control handles are distributed by
67  *   the rctl_ids ID space.  The handles are private and not to be
68  *   advertised to userland; all userland interactions are via the rctl
69  *   names.
70  *
71  *   Entities inherit their rctls from their predecessor.  Since projects have
72  *   no ancestor, they inherit their rctls from the rctl dict for project
73  *   rctls.  It is expected that project controls will be set to their
74  *   appropriate values shortly after project creation, presumably from a
75  *   policy source such as the project database.
76  *
77  * Data structures
78  *   The rctl_set_t attached to each of the process model entities is a simple
79  *   hash table keyed on the rctl handle assigned at registration.  The entries
80  *   in the hash table are rctl_t's, whose relationship with the active control
81  *   values on that resource and with the global state of the resource we
82  *   illustrate below:
83  *
84  *   rctl_dict[key] --> rctl_dict_entry
85  *			   ^
86  *			   |
87  *			+--+---+
88  *   rctl_set[key] ---> | rctl | --> value <-> value <-> system value --> NULL
89  *			+--+---+		 ^
90  *			   |			 |
91  *			   +------- cursor ------+
92  *
93  *   That is, the rctl contains a back pointer to the global resource control
94  *   state for this resource, which is also available in the rctl_dict hash
95  *   table mentioned earlier.  The rctl contains two pointers to resource
96  *   control values:  one, values, indicates the entire sequence of control
97  *   values; the other, cursor, indicates the currently active control
98  *   value--the next value to be enforced.  The value list itself is an open,
99  *   doubly-linked list, the last non-NULL member of which is the system value
100  *   for that resource (being the theoretical/conventional maximum allowable
101  *   value for the resource on this OS instance).
102  *
103  * Ops Vector
104  *   Subsystems publishing rctls need not provide instances of all of the
105  *   functions specified by the ops vector.  In particular, if general
106  *   rctl_*() entry points are not being called, certain functions can be
107  *   omitted.  These align as follows:
108  *
109  *   rctl_set()
110  *     You may wish to provide a set callback if locking circumstances prevent
111  *     it or if the performance cost of requesting the enforced value from the
112  *     resource control is prohibitively expensive.  For instance, the currently
113  *     enforced file size limit is stored on the process in the p_fsz_ctl to
114  *     maintain read()/write() performance.
115  *
116  *   rctl_test()
117  *     You must provide a test callback if you are using the rctl_test()
118  *     interface.  An action callback is optional.
119  *
120  *   rctl_action()
121  *     You may wish to provide an action callback.
122  *
123  * Registration
124  *   New resource controls can be added to a running instance by loaded modules
125  *   via registration.  (The current implementation does not support unloadable
126  *   modules; this functionality can be added if needed, via an
127  *   activation/deactivation interface involving the manipulation of the
128  *   ops vector for the resource control(s) needing to support unloading.)
129  *
130  * Control value ordering
131  *   Because the rctl_val chain on each rctl must be navigable in a
132  *   deterministic way, we have to define an ordering on the rctl_val_t's.  The
133  *   defined order is (flags & [maximal], value, flags & [deny-action],
134  *   privilege).
135  *
136  * Locking
137  *   rctl_dict_lock must be acquired prior to rctl_lists_lock.  Since
138  *   rctl_dict_lock or rctl_lists_lock can be acquired at the enforcement point
139  *   of any subsystem, holding subsystem locks, it is at all times inappropriate
140  *   to call kmem_alloc(., KM_SLEEP) while holding either of these locks.
141  *   Traversing any of the various resource control entity lists requires
142  *   holding rctl_lists_lock.
143  *
144  *   Each individual resource control set associated with an entity must have
145  *   its rcs_lock held for the duration of any operations that would add
146  *   resource controls or control values to the set.
147  *
148  *   The locking subsequence of interest is: p_lock, rctl_dict_lock,
149  *   rctl_lists_lock, entity->rcs_lock.
150  *
151  * The projects(4) database and project entity resource controls
152  *   A special case is made for RCENTITY_PROJECT values set through the
153  *   setproject(3PROJECT) interface.  setproject() makes use of a private
154  *   interface, setprojrctl(), which passes through an array of resource control
155  *   blocks that need to be set while holding the entity->rcs_lock.  This
156  *   ensures that the act of modifying a project's resource controls is
157  *   "atomic" within the kernel.
158  *
159  *   Within the rctl sub-system, we provide two interfaces that are only used by
160  *   the setprojrctl() code path - rctl_local_insert_all() and
161  *   rctl_local_replace_all().  rctl_local_insert_all() will ensure that the
162  *   resource values specified in *new_values are applied.
163  *   rctl_local_replace_all() will purge the current rctl->rc_projdb and
164  *   rctl->rc_values entries, and apply the *new_values.
165  *
166  *   These functions modify not only the linked list of active resource controls
167  *   (rctl->rc_values), but also a "cached" linked list (rctl->rc_projdb) of
168  *   values set through these interfaces.  To clarify:
169  *
170  *      rctl->rc_values - a linked list of rctl_val_t.  These are the active
171  *      resource values associated with this rctl, and may have been set by
172  *      setrctl() - via prctl(1M), or by setprojrctl() - via
173  *      setproject(3PROJECT).
174  *
175  *      rctl->rc_projdb - a linked list of rctl_val_t.  These reflect the
176  *      resource values set by the setprojrctl() code path.  rc_projdb is not
177  *      referenced by any other component of the rctl sub-system.
178  *
179  *   As various locks are held when calling these functions, we ensure that all
180  *   the possible memory allocations are performed prior to calling the
181  *   function.  *alloc_values is a linked list of uninitialized rctl_val_t,
182  *   which may be used to duplicate a new resource control value (passed in as
183  *   one of the members of the *new_values linked list), in order to populate
184  *   rctl->rc_values.
185  */
186 
187 id_t max_rctl_hndl = 32768;
188 int rctl_dict_size = 64;
189 int rctl_set_size = 8;
190 kmutex_t rctl_dict_lock;
191 mod_hash_t *rctl_dict;
192 mod_hash_t *rctl_dict_by_name;
193 id_space_t *rctl_ids;
194 kmem_cache_t *rctl_cache;	/* kmem cache for rctl structures */
195 kmem_cache_t *rctl_val_cache;	/* kmem cache for rctl values */
196 
197 kmutex_t rctl_lists_lock;
198 rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1];
199 
200 /*
201  * Default resource control operations and ops vector
202  *   To be used if the particular rcontrol has no specific actions defined, or
203  *   if the subsystem providing the control is quiescing (in preparation for
204  *   unloading, presumably.)
205  *
206  *   Resource controls with callbacks should fill the unused operations with the
207  *   appropriate default impotent callback.
208  */
209 /*ARGSUSED*/
210 void
211 rcop_no_action(struct rctl *r, struct proc *p, rctl_entity_p_t *e)
212 {
213 }
214 
215 /*ARGSUSED*/
216 rctl_qty_t
217 rcop_no_usage(struct rctl *r, struct proc *p)
218 {
219 	return (0);
220 }
221 
222 /*ARGSUSED*/
223 int
224 rcop_no_set(struct rctl *r, struct proc *p, rctl_entity_p_t *e, rctl_qty_t l)
225 {
226 	return (0);
227 }
228 
229 /*ARGSUSED*/
230 int
231 rcop_no_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
232     struct rctl_val *rv, rctl_qty_t i, uint_t f)
233 {
234 	return (0);
235 }
236 
237 rctl_ops_t rctl_default_ops = {
238 	rcop_no_action,
239 	rcop_no_usage,
240 	rcop_no_set,
241 	rcop_no_test
242 };
243 
244 /*
245  * Default "absolute" resource control operation and ops vector
246  *   Useful if there is no usage associated with the
247  *   resource control.
248  */
249 /*ARGSUSED*/
250 int
251 rcop_absolute_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
252     struct rctl_val *rv, rctl_qty_t i, uint_t f)
253 {
254 	return (i > rv->rcv_value);
255 }
256 
257 rctl_ops_t rctl_absolute_ops = {
258 	rcop_no_action,
259 	rcop_no_usage,
260 	rcop_no_set,
261 	rcop_absolute_test
262 };
263 
264 /*ARGSUSED*/
265 static uint_t
266 rctl_dict_hash_by_id(void *hash_data, mod_hash_key_t key)
267 {
268 	return ((uint_t)(uintptr_t)key % rctl_dict_size);
269 }
270 
271 static int
272 rctl_dict_id_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
273 {
274 	uint_t u1 = (uint_t)(uintptr_t)key1;
275 	uint_t u2 = (uint_t)(uintptr_t)key2;
276 
277 	if (u1 > u2)
278 		return (1);
279 
280 	if (u1 == u2)
281 		return (0);
282 
283 	return (-1);
284 }
285 
286 static void
287 rctl_dict_val_dtor(mod_hash_val_t val)
288 {
289 	rctl_dict_entry_t *kr = (rctl_dict_entry_t *)val;
290 
291 	kmem_free(kr, sizeof (rctl_dict_entry_t));
292 }
293 
294 /*
295  * size_t rctl_build_name_buf()
296  *
297  * Overview
298  *   rctl_build_name_buf() walks all active resource controls in the dictionary,
299  *   building a buffer of continguous NUL-terminated strings.
300  *
301  * Return values
302  *   The size of the buffer is returned, the passed pointer's contents are
303  *   modified to that of the location of the buffer.
304  *
305  * Caller's context
306  *   Caller must be in a context suitable for KM_SLEEP allocations.
307  */
308 size_t
309 rctl_build_name_buf(char **rbufp)
310 {
311 	size_t req_size, cpy_size;
312 	char *rbufloc;
313 	int i;
314 
315 rctl_rebuild_name_buf:
316 	req_size = cpy_size = 0;
317 
318 	/*
319 	 * Calculate needed buffer length.
320 	 */
321 	mutex_enter(&rctl_lists_lock);
322 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
323 		rctl_dict_entry_t *rde;
324 
325 		for (rde = rctl_lists[i];
326 		    rde != NULL;
327 		    rde = rde->rcd_next)
328 			req_size += strlen(rde->rcd_name) + 1;
329 	}
330 	mutex_exit(&rctl_lists_lock);
331 
332 	rbufloc = *rbufp = kmem_alloc(req_size, KM_SLEEP);
333 
334 	/*
335 	 * Copy rctl names into our buffer.  If the copy length exceeds the
336 	 * allocate length (due to registration changes), stop copying, free the
337 	 * buffer, and start again.
338 	 */
339 	mutex_enter(&rctl_lists_lock);
340 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
341 		rctl_dict_entry_t *rde;
342 
343 		for (rde = rctl_lists[i];
344 		    rde != NULL;
345 		    rde = rde->rcd_next) {
346 			size_t length = strlen(rde->rcd_name) + 1;
347 
348 			cpy_size += length;
349 
350 			if (cpy_size > req_size) {
351 				kmem_free(*rbufp, req_size);
352 				mutex_exit(&rctl_lists_lock);
353 				goto rctl_rebuild_name_buf;
354 			}
355 
356 			bcopy(rde->rcd_name, rbufloc, length);
357 			rbufloc += length;
358 		}
359 	}
360 	mutex_exit(&rctl_lists_lock);
361 
362 	return (req_size);
363 }
364 
365 /*
366  * rctl_dict_entry_t *rctl_dict_lookup(const char *)
367  *
368  * Overview
369  *   rctl_dict_lookup() returns the resource control dictionary entry for the
370  *   named resource control.
371  *
372  * Return values
373  *   A pointer to the appropriate resource control dictionary entry, or NULL if
374  *   no such named entry exists.
375  *
376  * Caller's context
377  *   Caller must not be holding rctl_dict_lock.
378  */
379 rctl_dict_entry_t *
380 rctl_dict_lookup(const char *name)
381 {
382 	rctl_dict_entry_t *rde;
383 
384 	mutex_enter(&rctl_dict_lock);
385 
386 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
387 	    (mod_hash_val_t *)&rde) == MH_ERR_NOTFOUND) {
388 		mutex_exit(&rctl_dict_lock);
389 		return (NULL);
390 	}
391 
392 	mutex_exit(&rctl_dict_lock);
393 
394 	return (rde);
395 }
396 
397 /*
398  * rctl_hndl_t rctl_hndl_lookup(const char *)
399  *
400  * Overview
401  *   rctl_hndl_lookup() returns the resource control id (the "handle") for the
402  *   named resource control.
403  *
404  * Return values
405  *   The appropriate id, or -1 if no such named entry exists.
406  *
407  * Caller's context
408  *   Caller must not be holding rctl_dict_lock.
409  */
410 rctl_hndl_t
411 rctl_hndl_lookup(const char *name)
412 {
413 	rctl_dict_entry_t *rde;
414 
415 	if ((rde = rctl_dict_lookup(name)) == NULL)
416 		return (-1);
417 
418 	return (rde->rcd_id);
419 }
420 
421 /*
422  * rctl_dict_entry_t * rctl_dict_lookup_hndl(rctl_hndl_t)
423  *
424  * Overview
425  *   rctl_dict_lookup_hndl() completes the public lookup functions, by returning
426  *   the resource control dictionary entry matching a given resource control id.
427  *
428  * Return values
429  *   A pointer to the matching resource control dictionary entry, or NULL if the
430  *   id does not match any existing entries.
431  *
432  * Caller's context
433  *   Caller must not be holding rctl_lists_lock.
434  */
435 rctl_dict_entry_t *
436 rctl_dict_lookup_hndl(rctl_hndl_t hndl)
437 {
438 	uint_t i;
439 
440 	mutex_enter(&rctl_lists_lock);
441 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
442 		rctl_dict_entry_t *rde;
443 
444 		for (rde = rctl_lists[i];
445 		    rde != NULL;
446 		    rde = rde->rcd_next)
447 			if (rde->rcd_id == hndl) {
448 				mutex_exit(&rctl_lists_lock);
449 				return (rde);
450 			}
451 	}
452 	mutex_exit(&rctl_lists_lock);
453 
454 	return (NULL);
455 }
456 
457 /*
458  * void rctl_add_default_limit(const char *name, rctl_qty_t value,
459  *     rctl_priv_t privilege, uint_t action)
460  *
461  * Overview
462  *   Create a default limit with specified value, privilege, and action.
463  *
464  * Return value
465  *   No value returned.
466  */
467 void
468 rctl_add_default_limit(const char *name, rctl_qty_t value,
469     rctl_priv_t privilege, uint_t action)
470 {
471 	rctl_val_t *dval;
472 	rctl_dict_entry_t *rde;
473 
474 	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
475 	bzero(dval, sizeof (rctl_val_t));
476 	dval->rcv_value = value;
477 	dval->rcv_privilege = privilege;
478 	dval->rcv_flagaction = action;
479 	dval->rcv_action_recip_pid = -1;
480 
481 	rde = rctl_dict_lookup(name);
482 	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
483 }
484 
485 /*
486  * void rctl_add_legacy_limit(const char *name, const char *mname,
487  *     const char *lname, rctl_qty_t dflt)
488  *
489  * Overview
490  *   Create a default privileged limit, using the value obtained from
491  *   /etc/system if it exists and is greater than the specified default
492  *   value.  Exists primarily for System V IPC.
493  *
494  * Return value
495  *   No value returned.
496  */
497 void
498 rctl_add_legacy_limit(const char *name, const char *mname, const char *lname,
499     rctl_qty_t dflt, rctl_qty_t max)
500 {
501 	rctl_qty_t qty;
502 
503 	if (!mod_sysvar(mname, lname, &qty) || (qty < dflt))
504 		qty = dflt;
505 
506 	if (qty > max)
507 		qty = max;
508 
509 	rctl_add_default_limit(name, qty, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
510 }
511 
512 rctl_set_t *
513 rctl_entity_obtain_rset(rctl_dict_entry_t *rcd, struct proc *p)
514 {
515 	rctl_set_t *rset = NULL;
516 
517 	if (rcd == NULL)
518 		return (NULL);
519 
520 	switch (rcd->rcd_entity) {
521 	case RCENTITY_PROCESS:
522 		rset = p->p_rctls;
523 		break;
524 	case RCENTITY_TASK:
525 		ASSERT(MUTEX_HELD(&p->p_lock));
526 		if (p->p_task != NULL)
527 			rset = p->p_task->tk_rctls;
528 		break;
529 	case RCENTITY_PROJECT:
530 		ASSERT(MUTEX_HELD(&p->p_lock));
531 		if (p->p_task != NULL &&
532 		    p->p_task->tk_proj != NULL)
533 			rset = p->p_task->tk_proj->kpj_rctls;
534 		break;
535 	case RCENTITY_ZONE:
536 		ASSERT(MUTEX_HELD(&p->p_lock));
537 		if (p->p_zone != NULL)
538 			rset = p->p_zone->zone_rctls;
539 		break;
540 	default:
541 		panic("unknown rctl entity type %d seen", rcd->rcd_entity);
542 		break;
543 	}
544 
545 	return (rset);
546 }
547 
548 static void
549 rctl_entity_obtain_entity_p(rctl_entity_t entity, struct proc *p,
550     rctl_entity_p_t *e)
551 {
552 	e->rcep_p.proc = NULL;
553 	e->rcep_t = entity;
554 
555 	switch (entity) {
556 	case RCENTITY_PROCESS:
557 		e->rcep_p.proc = p;
558 		break;
559 	case RCENTITY_TASK:
560 		ASSERT(MUTEX_HELD(&p->p_lock));
561 		if (p->p_task != NULL)
562 			e->rcep_p.task = p->p_task;
563 		break;
564 	case RCENTITY_PROJECT:
565 		ASSERT(MUTEX_HELD(&p->p_lock));
566 		if (p->p_task != NULL &&
567 		    p->p_task->tk_proj != NULL)
568 			e->rcep_p.proj = p->p_task->tk_proj;
569 		break;
570 	case RCENTITY_ZONE:
571 		ASSERT(MUTEX_HELD(&p->p_lock));
572 		if (p->p_zone != NULL)
573 			e->rcep_p.zone = p->p_zone;
574 		break;
575 	default:
576 		panic("unknown rctl entity type %d seen", entity);
577 		break;
578 	}
579 }
580 
581 static void
582 rctl_gp_alloc(rctl_alloc_gp_t *rcgp)
583 {
584 	uint_t i;
585 
586 	if (rcgp->rcag_nctls > 0) {
587 		rctl_t *prev = kmem_cache_alloc(rctl_cache, KM_SLEEP);
588 		rctl_t *rctl = prev;
589 
590 		rcgp->rcag_ctls = prev;
591 
592 		for (i = 1; i < rcgp->rcag_nctls; i++) {
593 			rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
594 			prev->rc_next = rctl;
595 			prev = rctl;
596 		}
597 
598 		rctl->rc_next = NULL;
599 	}
600 
601 	if (rcgp->rcag_nvals > 0) {
602 		rctl_val_t *prev = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
603 		rctl_val_t *rval = prev;
604 
605 		rcgp->rcag_vals = prev;
606 
607 		for (i = 1; i < rcgp->rcag_nvals; i++) {
608 			rval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
609 			prev->rcv_next = rval;
610 			prev = rval;
611 		}
612 
613 		rval->rcv_next = NULL;
614 	}
615 
616 }
617 
618 static rctl_val_t *
619 rctl_gp_detach_val(rctl_alloc_gp_t *rcgp)
620 {
621 	rctl_val_t *rval = rcgp->rcag_vals;
622 
623 	ASSERT(rcgp->rcag_nvals > 0);
624 	rcgp->rcag_nvals--;
625 	rcgp->rcag_vals = rval->rcv_next;
626 
627 	rval->rcv_next = NULL;
628 
629 	return (rval);
630 }
631 
632 static rctl_t *
633 rctl_gp_detach_ctl(rctl_alloc_gp_t *rcgp)
634 {
635 	rctl_t *rctl = rcgp->rcag_ctls;
636 
637 	ASSERT(rcgp->rcag_nctls > 0);
638 	rcgp->rcag_nctls--;
639 	rcgp->rcag_ctls = rctl->rc_next;
640 
641 	rctl->rc_next = NULL;
642 
643 	return (rctl);
644 
645 }
646 
647 static void
648 rctl_gp_free(rctl_alloc_gp_t *rcgp)
649 {
650 	rctl_val_t *rval = rcgp->rcag_vals;
651 	rctl_t *rctl = rcgp->rcag_ctls;
652 
653 	while (rval != NULL) {
654 		rctl_val_t *next = rval->rcv_next;
655 
656 		kmem_cache_free(rctl_val_cache, rval);
657 		rval = next;
658 	}
659 
660 	while (rctl != NULL) {
661 		rctl_t *next = rctl->rc_next;
662 
663 		kmem_cache_free(rctl_cache, rctl);
664 		rctl = next;
665 	}
666 }
667 
668 /*
669  * void rctl_prealloc_destroy(rctl_alloc_gp_t *)
670  *
671  * Overview
672  *   Release all unused memory allocated via one of the "prealloc" functions:
673  *   rctl_set_init_prealloc, rctl_set_dup_prealloc, or rctl_rlimit_set_prealloc.
674  *
675  * Return values
676  *   None.
677  *
678  * Caller's context
679  *   No restrictions on context.
680  */
681 void
682 rctl_prealloc_destroy(rctl_alloc_gp_t *gp)
683 {
684 	rctl_gp_free(gp);
685 	kmem_free(gp, sizeof (rctl_alloc_gp_t));
686 }
687 
688 /*
689  * int rctl_val_cmp(rctl_val_t *, rctl_val_t *, int)
690  *
691  * Overview
692  *   This function defines an ordering to rctl_val_t's in order to allow
693  *   for correct placement in value lists. When the imprecise flag is set,
694  *   the action recipient is ignored. This is to facilitate insert,
695  *   delete, and replace operations by rctlsys.
696  *
697  * Return values
698  *   0 if the val_t's are are considered identical
699  *   -1 if a is ordered lower than b
700  *   1 if a is lowered higher than b
701  *
702  * Caller's context
703  *   No restrictions on context.
704  */
705 int
706 rctl_val_cmp(rctl_val_t *a, rctl_val_t *b, int imprecise)
707 {
708 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) <
709 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
710 		return (-1);
711 
712 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) >
713 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
714 		return (1);
715 
716 	if (a->rcv_value < b->rcv_value)
717 		return (-1);
718 
719 	if (a->rcv_value > b->rcv_value)
720 		return (1);
721 
722 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) <
723 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
724 		return (-1);
725 
726 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) >
727 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
728 		return (1);
729 
730 	if (a->rcv_privilege < b->rcv_privilege)
731 		return (-1);
732 
733 	if (a->rcv_privilege > b->rcv_privilege)
734 		return (1);
735 
736 	if (imprecise)
737 		return (0);
738 
739 	if (a->rcv_action_recip_pid < b->rcv_action_recip_pid)
740 		return (-1);
741 
742 	if (a->rcv_action_recip_pid > b->rcv_action_recip_pid)
743 		return (1);
744 
745 	return (0);
746 }
747 
748 static rctl_val_t *
749 rctl_val_list_find(rctl_val_t **head, rctl_val_t *cval)
750 {
751 	rctl_val_t *rval = *head;
752 
753 	while (rval != NULL) {
754 		if (rctl_val_cmp(cval, rval, 0) == 0)
755 			return (rval);
756 
757 		rval = rval->rcv_next;
758 	}
759 
760 	return (NULL);
761 
762 }
763 
764 /*
765  * int rctl_val_list_insert(rctl_val_t **, rctl_val_t *)
766  *
767  * Overview
768  *   This function inserts the rctl_val_t into the value list provided.
769  *   The insert is always successful unless if the value is a duplicate
770  *   of one already in the list.
771  *
772  * Return values
773  *    1 if the value was a duplicate of an existing value in the list.
774  *    0 if the insert was successful.
775  */
776 int
777 rctl_val_list_insert(rctl_val_t **root, rctl_val_t *rval)
778 {
779 	rctl_val_t *prev;
780 	int equiv;
781 
782 	rval->rcv_next = NULL;
783 	rval->rcv_prev = NULL;
784 
785 	if (*root == NULL) {
786 		*root = rval;
787 		return (0);
788 	}
789 
790 	equiv = rctl_val_cmp(rval, *root, 0);
791 
792 	if (equiv == 0)
793 		return (1);
794 
795 	if (equiv < 0) {
796 		rval->rcv_next = *root;
797 		rval->rcv_next->rcv_prev = rval;
798 		*root = rval;
799 
800 		return (0);
801 	}
802 
803 	prev = *root;
804 	while (prev->rcv_next != NULL &&
805 	    (equiv = rctl_val_cmp(rval, prev->rcv_next, 0)) > 0) {
806 		prev = prev->rcv_next;
807 	}
808 
809 	if (equiv == 0)
810 		return (1);
811 
812 	rval->rcv_next = prev->rcv_next;
813 	if (rval->rcv_next != NULL)
814 		rval->rcv_next->rcv_prev = rval;
815 	prev->rcv_next = rval;
816 	rval->rcv_prev = prev;
817 
818 	return (0);
819 }
820 
821 static int
822 rctl_val_list_delete(rctl_val_t **root, rctl_val_t *rval)
823 {
824 	rctl_val_t *prev;
825 
826 	if (*root == NULL)
827 		return (-1);
828 
829 	prev = *root;
830 	if (rctl_val_cmp(rval, prev, 0) == 0) {
831 		*root = prev->rcv_next;
832 		if (*root != NULL)
833 			(*root)->rcv_prev = NULL;
834 
835 		kmem_cache_free(rctl_val_cache, prev);
836 
837 		return (0);
838 	}
839 
840 	while (prev->rcv_next != NULL &&
841 	    rctl_val_cmp(rval, prev->rcv_next, 0) != 0) {
842 		prev = prev->rcv_next;
843 	}
844 
845 	if (prev->rcv_next == NULL) {
846 		/*
847 		 * If we navigate the entire list and cannot find a match, then
848 		 * return failure.
849 		 */
850 		return (-1);
851 	}
852 
853 	prev = prev->rcv_next;
854 	prev->rcv_prev->rcv_next = prev->rcv_next;
855 	if (prev->rcv_next != NULL)
856 		prev->rcv_next->rcv_prev = prev->rcv_prev;
857 
858 	kmem_cache_free(rctl_val_cache, prev);
859 
860 	return (0);
861 }
862 
863 static rctl_val_t *
864 rctl_val_list_dup(rctl_val_t *rval, rctl_alloc_gp_t *ragp, struct proc *oldp,
865     struct proc *newp)
866 {
867 	rctl_val_t *head = NULL;
868 
869 	for (; rval != NULL; rval = rval->rcv_next) {
870 		rctl_val_t *dval = rctl_gp_detach_val(ragp);
871 
872 		bcopy(rval, dval, sizeof (rctl_val_t));
873 		dval->rcv_prev = dval->rcv_next = NULL;
874 
875 		if (oldp == NULL ||
876 		    rval->rcv_action_recipient == NULL ||
877 		    rval->rcv_action_recipient == oldp) {
878 			if (rval->rcv_privilege == RCPRIV_BASIC) {
879 				dval->rcv_action_recipient = newp;
880 				dval->rcv_action_recip_pid = newp->p_pid;
881 			} else {
882 				dval->rcv_action_recipient = NULL;
883 				dval->rcv_action_recip_pid = -1;
884 			}
885 
886 			(void) rctl_val_list_insert(&head, dval);
887 		} else {
888 			kmem_cache_free(rctl_val_cache, dval);
889 		}
890 	}
891 
892 	return (head);
893 }
894 
895 static void
896 rctl_val_list_reset(rctl_val_t *rval)
897 {
898 	for (; rval != NULL; rval = rval->rcv_next)
899 		rval->rcv_firing_time = 0;
900 }
901 
902 static uint_t
903 rctl_val_list_count(rctl_val_t *rval)
904 {
905 	uint_t n = 0;
906 
907 	for (; rval != NULL; rval = rval->rcv_next)
908 		n++;
909 
910 	return (n);
911 }
912 
913 
914 static void
915 rctl_val_list_free(rctl_val_t *rval)
916 {
917 	while (rval != NULL) {
918 		rctl_val_t *next = rval->rcv_next;
919 
920 		kmem_cache_free(rctl_val_cache, rval);
921 
922 		rval = next;
923 	}
924 }
925 
926 /*
927  * rctl_qty_t rctl_model_maximum(rctl_dict_entry_t *, struct proc *)
928  *
929  * Overview
930  *   In cases where the operating system supports more than one process
931  *   addressing model, the operating system capabilities will exceed those of
932  *   one or more of these models.  Processes in a less capable model must have
933  *   their resources accurately controlled, without diluting those of their
934  *   descendants reached via exec().  rctl_model_maximum() returns the governing
935  *   value for the specified process with respect to a resource control, such
936  *   that the value can used for the RCTLOP_SET callback or compatability
937  *   support.
938  *
939  * Return values
940  *   The maximum value for the given process for the specified resource control.
941  *
942  * Caller's context
943  *   No restrictions on context.
944  */
945 rctl_qty_t
946 rctl_model_maximum(rctl_dict_entry_t *rde, struct proc *p)
947 {
948 	if (p->p_model == DATAMODEL_NATIVE)
949 		return (rde->rcd_max_native);
950 
951 	return (rde->rcd_max_ilp32);
952 }
953 
954 /*
955  * rctl_qty_t rctl_model_value(rctl_dict_entry_t *, struct proc *, rctl_qty_t)
956  *
957  * Overview
958  *   Convenience function wrapping the rctl_model_maximum() functionality.
959  *
960  * Return values
961  *   The lesser of the process's maximum value and the given value for the
962  *   specified resource control.
963  *
964  * Caller's context
965  *   No restrictions on context.
966  */
967 rctl_qty_t
968 rctl_model_value(rctl_dict_entry_t *rde, struct proc *p, rctl_qty_t value)
969 {
970 	rctl_qty_t max = rctl_model_maximum(rde, p);
971 
972 	return (value < max ? value : max);
973 }
974 
975 static void
976 rctl_set_insert(rctl_set_t *set, rctl_hndl_t hndl, rctl_t *rctl)
977 {
978 	uint_t index = hndl % rctl_set_size;
979 	rctl_t *next_ctl, *prev_ctl;
980 
981 	ASSERT(MUTEX_HELD(&set->rcs_lock));
982 
983 	rctl->rc_next = NULL;
984 
985 	if (set->rcs_ctls[index] == NULL) {
986 		set->rcs_ctls[index] = rctl;
987 		return;
988 	}
989 
990 	if (hndl < set->rcs_ctls[index]->rc_id) {
991 		rctl->rc_next = set->rcs_ctls[index];
992 		set->rcs_ctls[index] = rctl;
993 
994 		return;
995 	}
996 
997 	for (next_ctl = set->rcs_ctls[index]->rc_next,
998 	    prev_ctl = set->rcs_ctls[index];
999 	    next_ctl != NULL;
1000 	    prev_ctl = next_ctl,
1001 	    next_ctl = next_ctl->rc_next) {
1002 		if (next_ctl->rc_id > hndl) {
1003 			rctl->rc_next = next_ctl;
1004 			prev_ctl->rc_next = rctl;
1005 
1006 			return;
1007 		}
1008 	}
1009 
1010 	rctl->rc_next = next_ctl;
1011 	prev_ctl->rc_next = rctl;
1012 }
1013 
1014 /*
1015  * rctl_set_t *rctl_set_create()
1016  *
1017  * Overview
1018  *   Create an empty resource control set, suitable for attaching to a
1019  *   controlled entity.
1020  *
1021  * Return values
1022  *   A pointer to the newly created set.
1023  *
1024  * Caller's context
1025  *   Safe for KM_SLEEP allocations.
1026  */
1027 rctl_set_t *
1028 rctl_set_create()
1029 {
1030 	rctl_set_t *rset = kmem_zalloc(sizeof (rctl_set_t), KM_SLEEP);
1031 
1032 	mutex_init(&rset->rcs_lock, NULL, MUTEX_DEFAULT, NULL);
1033 	rset->rcs_ctls = kmem_zalloc(rctl_set_size * sizeof (rctl_t *),
1034 	    KM_SLEEP);
1035 	rset->rcs_entity = -1;
1036 
1037 	return (rset);
1038 }
1039 
1040 /*
1041  * rctl_gp_alloc_t *rctl_set_init_prealloc(rctl_entity_t)
1042  *
1043  * Overview
1044  *    rctl_set_init_prealloc() examines the globally defined resource controls
1045  *    and their default values and returns a resource control allocation group
1046  *    populated with sufficient controls and values to form a representative
1047  *    resource control set for the specified entity.
1048  *
1049  * Return values
1050  *    A pointer to the newly created allocation group.
1051  *
1052  * Caller's context
1053  *    Caller must be in a context suitable for KM_SLEEP allocations.
1054  */
1055 rctl_alloc_gp_t *
1056 rctl_set_init_prealloc(rctl_entity_t entity)
1057 {
1058 	rctl_dict_entry_t *rde;
1059 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1060 
1061 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1062 
1063 	if (rctl_lists[entity] == NULL)
1064 		return (ragp);
1065 
1066 	mutex_enter(&rctl_lists_lock);
1067 
1068 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1069 		ragp->rcag_nctls++;
1070 		ragp->rcag_nvals += rctl_val_list_count(rde->rcd_default_value);
1071 	}
1072 
1073 	mutex_exit(&rctl_lists_lock);
1074 
1075 	rctl_gp_alloc(ragp);
1076 
1077 	return (ragp);
1078 }
1079 
1080 /*
1081  * rctl_set_t *rctl_set_init(rctl_entity_t)
1082  *
1083  * Overview
1084  *   rctl_set_create() creates a resource control set, initialized with the
1085  *   system infinite values on all registered controls, for attachment to a
1086  *   system entity requiring resource controls, such as a process or a task.
1087  *
1088  * Return values
1089  *   A pointer to the newly filled set.
1090  *
1091  * Caller's context
1092  *   Caller must be holding p_lock on entry so that RCTLOP_SET() functions
1093  *   may modify task and project members based on the proc structure
1094  *   they are passed.
1095  */
1096 rctl_set_t *
1097 rctl_set_init(rctl_entity_t entity, struct proc *p, rctl_entity_p_t *e,
1098     rctl_set_t *rset, rctl_alloc_gp_t *ragp)
1099 {
1100 	rctl_dict_entry_t *rde;
1101 
1102 	ASSERT(MUTEX_HELD(&p->p_lock));
1103 	ASSERT(e);
1104 	rset->rcs_entity = entity;
1105 
1106 	if (rctl_lists[entity] == NULL)
1107 		return (rset);
1108 
1109 	mutex_enter(&rctl_lists_lock);
1110 	mutex_enter(&rset->rcs_lock);
1111 
1112 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1113 		rctl_t *rctl = rctl_gp_detach_ctl(ragp);
1114 
1115 		rctl->rc_dict_entry = rde;
1116 		rctl->rc_id = rde->rcd_id;
1117 		rctl->rc_projdb = NULL;
1118 
1119 		rctl->rc_values = rctl_val_list_dup(rde->rcd_default_value,
1120 		    ragp, NULL, p);
1121 		rctl->rc_cursor = rctl->rc_values;
1122 
1123 		ASSERT(rctl->rc_cursor != NULL);
1124 
1125 		rctl_set_insert(rset, rde->rcd_id, rctl);
1126 
1127 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1128 		    rctl->rc_cursor->rcv_value));
1129 	}
1130 
1131 	mutex_exit(&rset->rcs_lock);
1132 	mutex_exit(&rctl_lists_lock);
1133 
1134 	return (rset);
1135 }
1136 
1137 static rctl_t *
1138 rctl_dup(rctl_t *rctl, rctl_alloc_gp_t *ragp, struct proc *oldp,
1139     struct proc *newp)
1140 {
1141 	rctl_t *dup = rctl_gp_detach_ctl(ragp);
1142 	rctl_val_t *dval;
1143 
1144 	dup->rc_id = rctl->rc_id;
1145 	dup->rc_dict_entry = rctl->rc_dict_entry;
1146 	dup->rc_next = NULL;
1147 	dup->rc_cursor = NULL;
1148 	dup->rc_values = rctl_val_list_dup(rctl->rc_values, ragp, oldp, newp);
1149 
1150 	for (dval = dup->rc_values;
1151 	    dval != NULL; dval = dval->rcv_next) {
1152 		if (rctl_val_cmp(rctl->rc_cursor, dval, 0) >= 0) {
1153 			dup->rc_cursor = dval;
1154 			break;
1155 		}
1156 	}
1157 
1158 	if (dup->rc_cursor == NULL)
1159 		dup->rc_cursor = dup->rc_values;
1160 
1161 	return (dup);
1162 }
1163 
1164 static void
1165 rctl_set_fill_alloc_gp(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1166 {
1167 	uint_t i;
1168 
1169 	bzero(ragp, sizeof (rctl_alloc_gp_t));
1170 
1171 	for (i = 0; i < rctl_set_size; i++) {
1172 		rctl_t *r = set->rcs_ctls[i];
1173 
1174 		while (r != NULL) {
1175 			ragp->rcag_nctls++;
1176 
1177 			ragp->rcag_nvals += rctl_val_list_count(r->rc_values);
1178 
1179 			r = r->rc_next;
1180 		}
1181 	}
1182 }
1183 
1184 /*
1185  * rctl_alloc_gp_t *rctl_set_dup_prealloc(rctl_set_t *)
1186  *
1187  * Overview
1188  *   Given a resource control set, allocate a sufficiently large allocation
1189  *   group to contain a duplicate of the set.
1190  *
1191  * Return value
1192  *   A pointer to the newly created allocation group.
1193  *
1194  * Caller's context
1195  *   Safe for KM_SLEEP allocations.
1196  */
1197 rctl_alloc_gp_t *
1198 rctl_set_dup_prealloc(rctl_set_t *set)
1199 {
1200 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1201 
1202 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1203 
1204 	mutex_enter(&set->rcs_lock);
1205 	rctl_set_fill_alloc_gp(set, ragp);
1206 	mutex_exit(&set->rcs_lock);
1207 
1208 	rctl_gp_alloc(ragp);
1209 
1210 	return (ragp);
1211 }
1212 
1213 /*
1214  * int rctl_set_dup_ready(rctl_set_t *, rctl_alloc_gp_t *)
1215  *
1216  * Overview
1217  *   Verify that the allocation group provided is large enough to allow a
1218  *   duplicate of the given resource control set to be constructed from its
1219  *   contents.
1220  *
1221  * Return values
1222  *   1 if the allocation group is sufficiently large, 0 otherwise.
1223  *
1224  * Caller's context
1225  *   rcs_lock must be held prior to entry.
1226  */
1227 int
1228 rctl_set_dup_ready(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1229 {
1230 	rctl_alloc_gp_t curr_gp;
1231 
1232 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1233 
1234 	rctl_set_fill_alloc_gp(set, &curr_gp);
1235 
1236 	if (curr_gp.rcag_nctls <= ragp->rcag_nctls &&
1237 	    curr_gp.rcag_nvals <= ragp->rcag_nvals)
1238 		return (1);
1239 
1240 	return (0);
1241 }
1242 
1243 /*
1244  * rctl_set_t *rctl_set_dup(rctl_set_t *, struct proc *, struct proc *,
1245  *   rctl_set_t *, rctl_alloc_gp_t *, int)
1246  *
1247  * Overview
1248  *   Make a duplicate of the resource control set.  The proc pointers are those
1249  *   of the owning process and of the process associated with the entity
1250  *   receiving the duplicate.
1251  *
1252  *   Duplication is a 3 stage process. Stage 1 is memory allocation for
1253  *   the duplicate set, which is taken care of by rctl_set_dup_prealloc().
1254  *   Stage 2 consists of copying all rctls and values from the old set into
1255  *   the new. Stage 3 completes the duplication by performing the appropriate
1256  *   callbacks for each rctl in the new set.
1257  *
1258  *   Stages 2 and 3 are handled by calling rctl_set_dup with the RCD_DUP and
1259  *   RCD_CALLBACK functions, respectively. The RCD_CALLBACK flag may only
1260  *   be supplied if the newp proc structure reflects the new task and
1261  *   project linkage.
1262  *
1263  * Return value
1264  *   A pointer to the duplicate set.
1265  *
1266  * Caller's context
1267  *   The rcs_lock of the set to be duplicated must be held prior to entry.
1268  */
1269 rctl_set_t *
1270 rctl_set_dup(rctl_set_t *set, struct proc *oldp, struct proc *newp,
1271     rctl_entity_p_t *e, rctl_set_t *dup, rctl_alloc_gp_t *ragp, int flag)
1272 {
1273 	uint_t i;
1274 	rctl_set_t	*iter;
1275 
1276 	ASSERT((flag & RCD_DUP) || (flag & RCD_CALLBACK));
1277 	ASSERT(e);
1278 	/*
1279 	 * When copying the old set, iterate over that. Otherwise, when
1280 	 * only callbacks have been requested, iterate over the dup set.
1281 	 */
1282 	if (flag & RCD_DUP) {
1283 		ASSERT(MUTEX_HELD(&set->rcs_lock));
1284 		iter = set;
1285 		dup->rcs_entity = set->rcs_entity;
1286 	} else {
1287 		iter = dup;
1288 	}
1289 
1290 	mutex_enter(&dup->rcs_lock);
1291 
1292 	for (i = 0; i < rctl_set_size; i++) {
1293 		rctl_t *r = iter->rcs_ctls[i];
1294 		rctl_t *d;
1295 
1296 		while (r != NULL) {
1297 			if (flag & RCD_DUP) {
1298 				d = rctl_dup(r, ragp, oldp, newp);
1299 				rctl_set_insert(dup, r->rc_id, d);
1300 			} else {
1301 				d = r;
1302 			}
1303 
1304 			if (flag & RCD_CALLBACK)
1305 				RCTLOP_SET(d, newp, e,
1306 				    rctl_model_value(d->rc_dict_entry, newp,
1307 				    d->rc_cursor->rcv_value));
1308 
1309 			r = r->rc_next;
1310 		}
1311 	}
1312 
1313 	mutex_exit(&dup->rcs_lock);
1314 
1315 	return (dup);
1316 }
1317 
1318 /*
1319  * void rctl_set_free(rctl_set_t *)
1320  *
1321  * Overview
1322  *   Delete resource control set and all attached values.
1323  *
1324  * Return values
1325  *   No value returned.
1326  *
1327  * Caller's context
1328  *   No restrictions on context.
1329  */
1330 void
1331 rctl_set_free(rctl_set_t *set)
1332 {
1333 	uint_t i;
1334 
1335 	mutex_enter(&set->rcs_lock);
1336 	for (i = 0; i < rctl_set_size; i++) {
1337 		rctl_t *r = set->rcs_ctls[i];
1338 
1339 		while (r != NULL) {
1340 			rctl_val_t *v = r->rc_values;
1341 			rctl_t *n = r->rc_next;
1342 
1343 			kmem_cache_free(rctl_cache, r);
1344 
1345 			rctl_val_list_free(v);
1346 
1347 			r = n;
1348 		}
1349 	}
1350 	mutex_exit(&set->rcs_lock);
1351 
1352 	kmem_free(set->rcs_ctls, sizeof (rctl_t *) * rctl_set_size);
1353 	kmem_free(set, sizeof (rctl_set_t));
1354 }
1355 
1356 /*
1357  * void rctl_set_reset(rctl_set_t *)
1358  *
1359  * Overview
1360  *   Resets all rctls within the set such that the lowest value becomes active.
1361  *
1362  * Return values
1363  *   No value returned.
1364  *
1365  * Caller's context
1366  *   No restrictions on context.
1367  */
1368 void
1369 rctl_set_reset(rctl_set_t *set, struct proc *p, rctl_entity_p_t *e)
1370 {
1371 	uint_t i;
1372 
1373 	ASSERT(e);
1374 
1375 	mutex_enter(&set->rcs_lock);
1376 	for (i = 0; i < rctl_set_size; i++) {
1377 		rctl_t *r = set->rcs_ctls[i];
1378 
1379 		while (r != NULL) {
1380 			r->rc_cursor = r->rc_values;
1381 			rctl_val_list_reset(r->rc_cursor);
1382 			RCTLOP_SET(r, p, e, rctl_model_value(r->rc_dict_entry,
1383 			    p, r->rc_cursor->rcv_value));
1384 
1385 			ASSERT(r->rc_cursor != NULL);
1386 
1387 			r = r->rc_next;
1388 		}
1389 	}
1390 
1391 	mutex_exit(&set->rcs_lock);
1392 }
1393 
1394 /*
1395  * void rctl_set_tearoff(rctl_set *, struct proc *)
1396  *
1397  * Overview
1398  *   Tear off any resource control values on this set with an action recipient
1399  *   equal to the specified process (as they are becoming invalid with the
1400  *   process's departure from this set as an observer).
1401  *
1402  * Return values
1403  *   No value returned.
1404  *
1405  * Caller's context
1406  *   No restrictions on context
1407  */
1408 void
1409 rctl_set_tearoff(rctl_set_t *set, struct proc *p)
1410 {
1411 	uint_t i;
1412 
1413 	mutex_enter(&set->rcs_lock);
1414 	for (i = 0; i < rctl_set_size; i++) {
1415 		rctl_t *r = set->rcs_ctls[i];
1416 
1417 		while (r != NULL) {
1418 			rctl_val_t *rval;
1419 
1420 tearoff_rewalk_list:
1421 			rval = r->rc_values;
1422 
1423 			while (rval != NULL) {
1424 				if (rval->rcv_privilege == RCPRIV_BASIC &&
1425 				    rval->rcv_action_recipient == p) {
1426 					if (r->rc_cursor == rval)
1427 						r->rc_cursor = rval->rcv_next;
1428 
1429 					(void) rctl_val_list_delete(
1430 					    &r->rc_values, rval);
1431 
1432 					goto tearoff_rewalk_list;
1433 				}
1434 
1435 				rval = rval->rcv_next;
1436 			}
1437 
1438 			ASSERT(r->rc_cursor != NULL);
1439 
1440 			r = r->rc_next;
1441 		}
1442 	}
1443 
1444 	mutex_exit(&set->rcs_lock);
1445 }
1446 
1447 int
1448 rctl_set_find(rctl_set_t *set, rctl_hndl_t hndl, rctl_t **rctl)
1449 {
1450 	uint_t index = hndl % rctl_set_size;
1451 	rctl_t *curr_ctl;
1452 
1453 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1454 
1455 	for (curr_ctl = set->rcs_ctls[index]; curr_ctl != NULL;
1456 	    curr_ctl = curr_ctl->rc_next) {
1457 		if (curr_ctl->rc_id == hndl) {
1458 			*rctl = curr_ctl;
1459 
1460 			return (0);
1461 		}
1462 	}
1463 
1464 	return (-1);
1465 }
1466 
1467 /*
1468  * rlim64_t rctl_enforced_value(rctl_hndl_t, rctl_set_t *, struct proc *)
1469  *
1470  * Overview
1471  *   Given a process, get the next enforced value on the rctl of the specified
1472  *   handle.
1473  *
1474  * Return value
1475  *   The enforced value.
1476  *
1477  * Caller's context
1478  *   For controls on process collectives, p->p_lock must be held across the
1479  *   operation.
1480  */
1481 /*ARGSUSED*/
1482 rctl_qty_t
1483 rctl_enforced_value(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p)
1484 {
1485 	rctl_t *rctl;
1486 	rlim64_t ret;
1487 
1488 	mutex_enter(&rset->rcs_lock);
1489 
1490 	if (rctl_set_find(rset, hndl, &rctl) == -1)
1491 		panic("unknown resource control handle %d requested", hndl);
1492 	else
1493 		ret = rctl_model_value(rctl->rc_dict_entry, p,
1494 		    rctl->rc_cursor->rcv_value);
1495 
1496 	mutex_exit(&rset->rcs_lock);
1497 
1498 	return (ret);
1499 }
1500 
1501 /*
1502  * int rctl_global_get(const char *, rctl_dict_entry_t *)
1503  *
1504  * Overview
1505  *   Copy a sanitized version of the global rctl for a given resource control
1506  *   name.  (By sanitization, we mean that the unsafe data pointers have been
1507  *   zeroed.)
1508  *
1509  * Return value
1510  *   -1 if name not defined, 0 otherwise.
1511  *
1512  * Caller's context
1513  *   No restrictions on context.  rctl_dict_lock must not be held.
1514  */
1515 int
1516 rctl_global_get(const char *name, rctl_dict_entry_t *drde)
1517 {
1518 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1519 
1520 	if (rde == NULL)
1521 		return (-1);
1522 
1523 	bcopy(rde, drde, sizeof (rctl_dict_entry_t));
1524 
1525 	drde->rcd_next = NULL;
1526 	drde->rcd_ops = NULL;
1527 
1528 	return (0);
1529 }
1530 
1531 /*
1532  * int rctl_global_set(const char *, rctl_dict_entry_t *)
1533  *
1534  * Overview
1535  *   Transfer the settable fields of the named rctl to the global rctl matching
1536  *   the given resource control name.
1537  *
1538  * Return value
1539  *   -1 if name not defined, 0 otherwise.
1540  *
1541  * Caller's context
1542  *   No restrictions on context.  rctl_dict_lock must not be held.
1543  */
1544 int
1545 rctl_global_set(const char *name, rctl_dict_entry_t *drde)
1546 {
1547 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1548 
1549 	if (rde == NULL)
1550 		return (-1);
1551 
1552 	rde->rcd_flagaction = drde->rcd_flagaction;
1553 	rde->rcd_syslog_level = drde->rcd_syslog_level;
1554 	rde->rcd_strlog_flags = drde->rcd_strlog_flags;
1555 
1556 	return (0);
1557 }
1558 
1559 static int
1560 rctl_local_op(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1561     int (*cbop)(rctl_hndl_t, struct proc *p, rctl_entity_p_t *e, rctl_t *,
1562     rctl_val_t *, rctl_val_t *), struct proc *p)
1563 {
1564 	rctl_t *rctl;
1565 	rctl_set_t *rset;
1566 	rctl_entity_p_t e;
1567 	int ret = 0;
1568 	rctl_dict_entry_t *rde = rctl_dict_lookup_hndl(hndl);
1569 
1570 local_op_retry:
1571 
1572 	ASSERT(MUTEX_HELD(&p->p_lock));
1573 
1574 	rset = rctl_entity_obtain_rset(rde, p);
1575 
1576 	if (rset == NULL) {
1577 		return (-1);
1578 	}
1579 	rctl_entity_obtain_entity_p(rset->rcs_entity, p, &e);
1580 
1581 	mutex_enter(&rset->rcs_lock);
1582 
1583 	/* using rctl's hndl, get rctl from local set */
1584 	if (rctl_set_find(rset, hndl, &rctl) == -1) {
1585 		mutex_exit(&rset->rcs_lock);
1586 		return (-1);
1587 	}
1588 
1589 	ret = cbop(hndl, p, &e, rctl, oval, nval);
1590 
1591 	mutex_exit(&rset->rcs_lock);
1592 	return (ret);
1593 }
1594 
1595 /*ARGSUSED*/
1596 static int
1597 rctl_local_get_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1598     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1599 {
1600 	if (oval == NULL) {
1601 		/*
1602 		 * RCTL_FIRST
1603 		 */
1604 		bcopy(rctl->rc_values, nval, sizeof (rctl_val_t));
1605 	} else {
1606 		/*
1607 		 * RCTL_NEXT
1608 		 */
1609 		rctl_val_t *tval = rctl_val_list_find(&rctl->rc_values, oval);
1610 
1611 		if (tval == NULL)
1612 			return (ESRCH);
1613 		else if (tval->rcv_next == NULL)
1614 			return (ENOENT);
1615 		else
1616 			bcopy(tval->rcv_next, nval, sizeof (rctl_val_t));
1617 	}
1618 
1619 	return (0);
1620 }
1621 
1622 /*
1623  * int rctl_local_get(rctl_hndl_t, rctl_val_t *)
1624  *
1625  * Overview
1626  *   Get the rctl value for the given flags.
1627  *
1628  * Return values
1629  *   0 for successful get, errno otherwise.
1630  */
1631 int
1632 rctl_local_get(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1633     struct proc *p)
1634 {
1635 	return (rctl_local_op(hndl, oval, nval, rctl_local_get_cb, p));
1636 }
1637 
1638 /*ARGSUSED*/
1639 static int
1640 rctl_local_delete_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1641     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1642 {
1643 	if ((oval = rctl_val_list_find(&rctl->rc_values, nval)) == NULL)
1644 		return (ESRCH);
1645 
1646 	if (rctl->rc_cursor == oval) {
1647 		rctl->rc_cursor = oval->rcv_next;
1648 		rctl_val_list_reset(rctl->rc_cursor);
1649 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1650 		    rctl->rc_cursor->rcv_value));
1651 
1652 		ASSERT(rctl->rc_cursor != NULL);
1653 	}
1654 
1655 	(void) rctl_val_list_delete(&rctl->rc_values, oval);
1656 
1657 	return (0);
1658 }
1659 
1660 /*
1661  * int rctl_local_delete(rctl_hndl_t, rctl_val_t *)
1662  *
1663  * Overview
1664  *   Delete the rctl value for the given flags.
1665  *
1666  * Return values
1667  *   0 for successful delete, errno otherwise.
1668  */
1669 int
1670 rctl_local_delete(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1671 {
1672 	return (rctl_local_op(hndl, NULL, val, rctl_local_delete_cb, p));
1673 }
1674 
1675 /*
1676  * rctl_local_insert_cb()
1677  *
1678  * Overview
1679  *   Insert a new value into the rctl's val list. If an error occurs,
1680  *   the val list must be left in the same state as when the function
1681  *   was entered.
1682  *
1683  * Return Values
1684  *   0 for successful insert, EINVAL if the value is duplicated in the
1685  *   existing list.
1686  */
1687 /*ARGSUSED*/
1688 static int
1689 rctl_local_insert_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1690     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1691 {
1692 	/*
1693 	 * Before inserting, confirm there are no duplicates of this value
1694 	 * and flag level. If there is a duplicate, flag an error and do
1695 	 * nothing.
1696 	 */
1697 	if (rctl_val_list_insert(&rctl->rc_values, nval) != 0)
1698 		return (EINVAL);
1699 
1700 	if (rctl_val_cmp(nval, rctl->rc_cursor, 0) < 0) {
1701 		rctl->rc_cursor = nval;
1702 		rctl_val_list_reset(rctl->rc_cursor);
1703 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1704 		    rctl->rc_cursor->rcv_value));
1705 
1706 		ASSERT(rctl->rc_cursor != NULL);
1707 	}
1708 
1709 	return (0);
1710 }
1711 
1712 /*
1713  * int rctl_local_insert(rctl_hndl_t, rctl_val_t *)
1714  *
1715  * Overview
1716  *   Insert the rctl value into the appropriate rctl set for the calling
1717  *   process, given the handle.
1718  */
1719 int
1720 rctl_local_insert(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1721 {
1722 	return (rctl_local_op(hndl, NULL, val, rctl_local_insert_cb, p));
1723 }
1724 
1725 /*
1726  * rctl_local_insert_all_cb()
1727  *
1728  * Overview
1729  *   Called for RCENTITY_PROJECT rctls only, via rctlsys_projset().
1730  *
1731  *   Inserts new values from the project database (new_values).  alloc_values
1732  *   should be a linked list of pre-allocated rctl_val_t, which are used to
1733  *   populate (rc_projdb).
1734  *
1735  *   Should the *new_values linked list match the contents of the rctl's
1736  *   rp_projdb then we do nothing.
1737  *
1738  * Return Values
1739  *   0 is always returned.
1740  */
1741 /*ARGSUSED*/
1742 static int
1743 rctl_local_insert_all_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1744     rctl_t *rctl, rctl_val_t *new_values, rctl_val_t *alloc_values)
1745 {
1746 	rctl_val_t *val;
1747 	rctl_val_t *tmp_val;
1748 	rctl_val_t *next;
1749 	int modified = 0;
1750 
1751 	/*
1752 	 * If this the first time we've set this project rctl, then we delete
1753 	 * all the privilege values.  These privilege values have been set by
1754 	 * rctl_add_default_limit().
1755 	 *
1756 	 * We save some cycles here by not calling rctl_val_list_delete().
1757 	 */
1758 	if (rctl->rc_projdb == NULL) {
1759 		val = rctl->rc_values;
1760 
1761 		while (val != NULL) {
1762 			if (val->rcv_privilege == RCPRIV_PRIVILEGED) {
1763 				if (val->rcv_prev != NULL)
1764 					val->rcv_prev->rcv_next = val->rcv_next;
1765 				else
1766 					rctl->rc_values = val->rcv_next;
1767 
1768 				if (val->rcv_next != NULL)
1769 					val->rcv_next->rcv_prev = val->rcv_prev;
1770 
1771 				tmp_val = val;
1772 				val = val->rcv_next;
1773 				kmem_cache_free(rctl_val_cache, tmp_val);
1774 			} else {
1775 				val = val->rcv_next;
1776 			}
1777 		}
1778 		modified = 1;
1779 	}
1780 
1781 	/*
1782 	 * Delete active values previously set through the project database.
1783 	 */
1784 	val = rctl->rc_projdb;
1785 
1786 	while (val != NULL) {
1787 
1788 		/* Is the old value found in the new values? */
1789 		if (rctl_val_list_find(&new_values, val) == NULL) {
1790 
1791 			/*
1792 			 * Delete from the active values if it originated from
1793 			 * the project database.
1794 			 */
1795 			if (((tmp_val = rctl_val_list_find(&rctl->rc_values,
1796 			    val)) != NULL) &&
1797 			    (tmp_val->rcv_flagaction & RCTL_LOCAL_PROJDB)) {
1798 				(void) rctl_val_list_delete(&rctl->rc_values,
1799 				    tmp_val);
1800 			}
1801 
1802 			tmp_val = val->rcv_next;
1803 			(void) rctl_val_list_delete(&rctl->rc_projdb, val);
1804 			val = tmp_val;
1805 			modified = 1;
1806 
1807 		} else
1808 			val = val->rcv_next;
1809 	}
1810 
1811 	/*
1812 	 * Insert new values from the project database.
1813 	 */
1814 	while (new_values != NULL) {
1815 		next = new_values->rcv_next;
1816 
1817 		/*
1818 		 * Insert this new value into the rc_projdb, and duplicate this
1819 		 * entry to the active list.
1820 		 */
1821 		if (rctl_val_list_insert(&rctl->rc_projdb, new_values) == 0) {
1822 
1823 			tmp_val = alloc_values->rcv_next;
1824 			bcopy(new_values, alloc_values, sizeof (rctl_val_t));
1825 			alloc_values->rcv_next = tmp_val;
1826 
1827 			if (rctl_val_list_insert(&rctl->rc_values,
1828 			    alloc_values) == 0) {
1829 				/* inserted move alloc_values on */
1830 				alloc_values = tmp_val;
1831 				modified = 1;
1832 			}
1833 		} else {
1834 			/*
1835 			 * Unlike setrctl() we don't want to return an error on
1836 			 * a duplicate entry; we are concerned solely with
1837 			 * ensuring that all the values specified are set.
1838 			 */
1839 			kmem_cache_free(rctl_val_cache, new_values);
1840 		}
1841 		new_values = next;
1842 	}
1843 
1844 	/* Teardown any unused rctl_val_t */
1845 	while (alloc_values != NULL) {
1846 		tmp_val = alloc_values;
1847 		alloc_values = alloc_values->rcv_next;
1848 		kmem_cache_free(rctl_val_cache, tmp_val);
1849 	}
1850 
1851 	/* Reset the cursor if rctl values have been modified */
1852 	if (modified) {
1853 		rctl->rc_cursor = rctl->rc_values;
1854 		rctl_val_list_reset(rctl->rc_cursor);
1855 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1856 		    rctl->rc_cursor->rcv_value));
1857 	}
1858 
1859 	return (0);
1860 }
1861 
1862 int
1863 rctl_local_insert_all(rctl_hndl_t hndl, rctl_val_t *new_values,
1864     rctl_val_t *alloc_values, struct proc *p)
1865 {
1866 	return (rctl_local_op(hndl, new_values, alloc_values,
1867 	    rctl_local_insert_all_cb, p));
1868 }
1869 
1870 /*
1871  * rctl_local_replace_all_cb()
1872  *
1873  * Overview
1874  *   Called for RCENTITY_PROJECT rctls only, via rctlsys_projset().
1875  *
1876  *   Clears the active rctl values (rc_values), and stored values from the
1877  *   previous insertions from the project database (rc_projdb).
1878  *
1879  *   Inserts new values from the project database (new_values).  alloc_values
1880  *   should be a linked list of pre-allocated rctl_val_t, which are used to
1881  *   populate (rc_projdb).
1882  *
1883  * Return Values
1884  *   0 is always returned.
1885  */
1886 /*ARGSUSED*/
1887 static int
1888 rctl_local_replace_all_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1889     rctl_t *rctl, rctl_val_t *new_values, rctl_val_t *alloc_values)
1890 {
1891 	rctl_val_t *val;
1892 	rctl_val_t *next;
1893 	rctl_val_t *tmp_val;
1894 
1895 	/* Delete all the privilege vaules */
1896 	val = rctl->rc_values;
1897 
1898 	while (val != NULL) {
1899 		if (val->rcv_privilege == RCPRIV_PRIVILEGED) {
1900 			if (val->rcv_prev != NULL)
1901 				val->rcv_prev->rcv_next = val->rcv_next;
1902 			else
1903 				rctl->rc_values = val->rcv_next;
1904 
1905 			if (val->rcv_next != NULL)
1906 				val->rcv_next->rcv_prev = val->rcv_prev;
1907 
1908 			tmp_val = val;
1909 			val = val->rcv_next;
1910 			kmem_cache_free(rctl_val_cache, tmp_val);
1911 		} else {
1912 			val = val->rcv_next;
1913 		}
1914 	}
1915 
1916 	/* Delete the contents of rc_projdb */
1917 	val = rctl->rc_projdb;
1918 	while (val != NULL) {
1919 
1920 		tmp_val = val;
1921 		val = val->rcv_next;
1922 		kmem_cache_free(rctl_val_cache, tmp_val);
1923 	}
1924 	rctl->rc_projdb = NULL;
1925 
1926 	/*
1927 	 * Insert new values from the project database.
1928 	 */
1929 	while (new_values != NULL) {
1930 		next = new_values->rcv_next;
1931 
1932 		if (rctl_val_list_insert(&rctl->rc_projdb, new_values) == 0) {
1933 			tmp_val = alloc_values->rcv_next;
1934 			bcopy(new_values, alloc_values, sizeof (rctl_val_t));
1935 			alloc_values->rcv_next = tmp_val;
1936 
1937 			if (rctl_val_list_insert(&rctl->rc_values,
1938 			    alloc_values) == 0) {
1939 				/* inserted, so move alloc_values on */
1940 				alloc_values = tmp_val;
1941 			}
1942 		} else {
1943 			/*
1944 			 * Unlike setrctl() we don't want to return an error on
1945 			 * a duplicate entry; we are concerned solely with
1946 			 * ensuring that all the values specified are set.
1947 			 */
1948 			kmem_cache_free(rctl_val_cache, new_values);
1949 		}
1950 
1951 		new_values = next;
1952 	}
1953 
1954 	/* Teardown any unused rctl_val_t */
1955 	while (alloc_values != NULL) {
1956 		tmp_val = alloc_values;
1957 		alloc_values = alloc_values->rcv_next;
1958 		kmem_cache_free(rctl_val_cache, tmp_val);
1959 	}
1960 
1961 	/* Always reset the cursor */
1962 	rctl->rc_cursor = rctl->rc_values;
1963 	rctl_val_list_reset(rctl->rc_cursor);
1964 	RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1965 	    rctl->rc_cursor->rcv_value));
1966 
1967 	return (0);
1968 }
1969 
1970 int
1971 rctl_local_replace_all(rctl_hndl_t hndl, rctl_val_t *new_values,
1972     rctl_val_t *alloc_values, struct proc *p)
1973 {
1974 	return (rctl_local_op(hndl, new_values, alloc_values,
1975 	    rctl_local_replace_all_cb, p));
1976 }
1977 
1978 static int
1979 rctl_local_replace_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1980     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1981 {
1982 	int ret;
1983 	rctl_val_t *tmp;
1984 
1985 	/* Verify that old will be delete-able */
1986 	tmp = rctl_val_list_find(&rctl->rc_values, oval);
1987 	if (tmp == NULL)
1988 		return (ESRCH);
1989 	/*
1990 	 * Caller should verify that value being deleted is not the
1991 	 * system value.
1992 	 */
1993 	ASSERT(tmp->rcv_privilege != RCPRIV_SYSTEM);
1994 
1995 	/*
1996 	 * rctl_local_insert_cb() does the job of flagging an error
1997 	 * for any duplicate values. So, call rctl_local_insert_cb()
1998 	 * for the new value first, then do deletion of the old value.
1999 	 * Since this is a callback function to rctl_local_op, we can
2000 	 * count on rcs_lock being held at this point. This guarantees
2001 	 * that there is at no point a visible list which contains both
2002 	 * new and old values.
2003 	 */
2004 	if (ret = rctl_local_insert_cb(hndl, p, e, rctl, NULL, nval))
2005 		return (ret);
2006 
2007 	ret = rctl_local_delete_cb(hndl, p, e, rctl, NULL, oval);
2008 	ASSERT(ret == 0);
2009 	return (0);
2010 }
2011 
2012 /*
2013  * int rctl_local_replace(rctl_hndl_t, void *, int, uint64_t *)
2014  *
2015  * Overview
2016  *   Replace the rctl value with a new one.
2017  *
2018  * Return values
2019  *   0 for successful replace, errno otherwise.
2020  */
2021 int
2022 rctl_local_replace(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
2023     struct proc *p)
2024 {
2025 	return (rctl_local_op(hndl, oval, nval, rctl_local_replace_cb, p));
2026 }
2027 
2028 /*
2029  * int rctl_rlimit_get(rctl_hndl_t, struct proc *, struct rlimit64 *)
2030  *
2031  * Overview
2032  *   To support rlimit compatibility, we need a function which takes a 64-bit
2033  *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
2034  *   This operation is only intended for legacy rlimits.
2035  */
2036 int
2037 rctl_rlimit_get(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64)
2038 {
2039 	rctl_t *rctl;
2040 	rctl_val_t *rval;
2041 	rctl_set_t *rset = p->p_rctls;
2042 	int soft_limit_seen = 0;
2043 	int test_for_deny = 1;
2044 
2045 	mutex_enter(&rset->rcs_lock);
2046 	if (rctl_set_find(rset, rc, &rctl) == -1) {
2047 		mutex_exit(&rset->rcs_lock);
2048 		return (-1);
2049 	}
2050 
2051 	rval = rctl->rc_values;
2052 
2053 	if (rctl->rc_dict_entry->rcd_flagaction & (RCTL_GLOBAL_DENY_NEVER |
2054 	    RCTL_GLOBAL_DENY_ALWAYS))
2055 		test_for_deny = 0;
2056 
2057 	/*
2058 	 * 1.  Find the first control value with the RCTL_LOCAL_DENY bit set.
2059 	 */
2060 	while (rval != NULL && rval->rcv_privilege != RCPRIV_SYSTEM) {
2061 		if (test_for_deny &&
2062 		    (rval->rcv_flagaction & RCTL_LOCAL_DENY) == 0) {
2063 			rval = rval->rcv_next;
2064 			continue;
2065 		}
2066 
2067 		/*
2068 		 * 2.  If this is an RCPRIV_BASIC value, then we've found the
2069 		 * effective soft limit and should set rlim_cur.  We should then
2070 		 * continue looking for another control value with the DENY bit
2071 		 * set.
2072 		 */
2073 		if (rval->rcv_privilege == RCPRIV_BASIC) {
2074 			if (soft_limit_seen) {
2075 				rval = rval->rcv_next;
2076 				continue;
2077 			}
2078 
2079 			if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2080 			    rval->rcv_value < rctl_model_maximum(
2081 			    rctl->rc_dict_entry, p))
2082 				rlp64->rlim_cur = rval->rcv_value;
2083 			else
2084 				rlp64->rlim_cur = RLIM64_INFINITY;
2085 			soft_limit_seen = 1;
2086 
2087 			rval = rval->rcv_next;
2088 			continue;
2089 		}
2090 
2091 		/*
2092 		 * 3.  This is an RCPRIV_PRIVILEGED value.  If we haven't found
2093 		 * a soft limit candidate, then we've found the effective hard
2094 		 * and soft limits and should set both  If we had found a soft
2095 		 * limit, then this is only the hard limit and we need only set
2096 		 * rlim_max.
2097 		 */
2098 		if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2099 		    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry,
2100 		    p))
2101 			rlp64->rlim_max = rval->rcv_value;
2102 		else
2103 			rlp64->rlim_max = RLIM64_INFINITY;
2104 		if (!soft_limit_seen)
2105 			rlp64->rlim_cur = rlp64->rlim_max;
2106 
2107 		mutex_exit(&rset->rcs_lock);
2108 		return (0);
2109 	}
2110 
2111 	if (rval == NULL) {
2112 		/*
2113 		 * This control sequence is corrupt, as it is not terminated by
2114 		 * a system privileged control value.
2115 		 */
2116 		mutex_exit(&rset->rcs_lock);
2117 		return (-1);
2118 	}
2119 
2120 	/*
2121 	 * 4.  If we run into a RCPRIV_SYSTEM value, then the hard limit (and
2122 	 * the soft, if we haven't a soft candidate) should be the value of the
2123 	 * system control value.
2124 	 */
2125 	if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2126 	    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry, p))
2127 		rlp64->rlim_max = rval->rcv_value;
2128 	else
2129 		rlp64->rlim_max = RLIM64_INFINITY;
2130 
2131 	if (!soft_limit_seen)
2132 		rlp64->rlim_cur = rlp64->rlim_max;
2133 
2134 	mutex_exit(&rset->rcs_lock);
2135 	return (0);
2136 }
2137 
2138 /*
2139  * rctl_alloc_gp_t *rctl_rlimit_set_prealloc(uint_t)
2140  *
2141  * Overview
2142  *   Before making a series of calls to rctl_rlimit_set(), we must have a
2143  *   preallocated batch of resource control values, as rctl_rlimit_set() can
2144  *   potentially consume two resource control values per call.
2145  *
2146  * Return values
2147  *   A populated resource control allocation group with 2n resource control
2148  *   values.
2149  *
2150  * Caller's context
2151  *   Must be safe for KM_SLEEP allocations.
2152  */
2153 rctl_alloc_gp_t *
2154 rctl_rlimit_set_prealloc(uint_t n)
2155 {
2156 	rctl_alloc_gp_t *gp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
2157 
2158 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
2159 
2160 	gp->rcag_nvals = 2 * n;
2161 
2162 	rctl_gp_alloc(gp);
2163 
2164 	return (gp);
2165 }
2166 
/*
 * int rctl_rlimit_set(rctl_hndl_t, struct proc *, struct rlimit64 *,
 *   rctl_alloc_gp_t *, int, int, const cred_t *)
 *
 * Overview
 *   To support rlimit compatibility, we need a function which takes a 64-bit
 *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
 *   This operation is only intended for legacy rlimits.
 *
 *   The hard limit (rlim_max) is always installed as a RCPRIV_PRIVILEGED
 *   value.  The soft limit (rlim_cur) is installed as an additional
 *   RCPRIV_BASIC value only when it is finite and strictly below the hard
 *   limit.  Both values carry the supplied flagaction and signal.
 *
 *   The implementation of rctl_rlimit_set() is a bit clever, as it tries to
 *   minimize the number of values placed on the value sequence in various
 *   cases.  Furthermore, we don't allow multiple identical privilege-action
 *   values on the same sequence.  (That is, we don't want a sequence like
 *   "while (1) { rlim.rlim_cur++; setrlimit(..., rlim); }" to exhaust kernel
 *   memory.)  So we want to delete any values with the same privilege value and
 *   action.
 *
 * Return values
 *   0 for successful set, errno otherwise. Errno will be either EINVAL
 *   or EPERM, in keeping with defined errnos for ulimit() and setrlimit()
 *   system calls.  EINVAL is returned when rlim_cur exceeds rlim_max, when
 *   rctl_rlimit_get() fails for the control, or when the control is not
 *   found in the process's rctl set.  EPERM is returned when the request
 *   raises a finite hard limit and secpolicy_resource() rejects cr.
 *
 * Caller's context
 *   The function acquires and releases p->p_rctls->rcs_lock itself.  ragp
 *   must be able to supply up to two values (see
 *   rctl_rlimit_set_prealloc()).
 */
/*ARGSUSED*/
int
rctl_rlimit_set(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64,
    rctl_alloc_gp_t *ragp, int flagaction, int signal, const cred_t *cr)
{
	rctl_t *rctl;
	rctl_val_t *rval, *rval_priv, *rval_basic;
	rctl_set_t *rset = p->p_rctls;
	rctl_qty_t max;
	rctl_entity_p_t e;
	struct rlimit64 cur_rl;

	/* Legacy rlimits are process controls; describe p as the entity. */
	e.rcep_t = RCENTITY_PROCESS;
	e.rcep_p.proc = p;

	/* The soft limit may never exceed the hard limit. */
	if (rlp64->rlim_cur > rlp64->rlim_max)
		return (EINVAL);

	/* Fetch the limits currently in force for the privilege check below. */
	if (rctl_rlimit_get(rc, p, &cur_rl) == -1)
		return (EINVAL);

	/*
	 * If we are not privileged, we can only lower the hard limit.
	 */
	if ((rlp64->rlim_max > cur_rl.rlim_max) &&
	    cur_rl.rlim_max != RLIM64_INFINITY &&
	    secpolicy_resource(cr) != 0)
		return (EPERM);

	mutex_enter(&rset->rcs_lock);

	if (rctl_set_find(rset, rc, &rctl) == -1) {
		mutex_exit(&rset->rcs_lock);
		return (EINVAL);
	}

	/* This value becomes the new privileged (hard limit) entry. */
	rval_priv = rctl_gp_detach_val(ragp);

	rval = rctl->rc_values;

	/*
	 * Walk the sequence up to, but not including, the first system value.
	 * Remove every basic value, and every value whose flagaction masked
	 * with ~RCTL_LOCAL_ACTION_MASK equals that of the incoming flagaction,
	 * so that repeated calls do not grow the sequence.
	 */
	while (rval != NULL) {
		/* Saved first: rval may be removed from the list below. */
		rctl_val_t *next = rval->rcv_next;

		if (rval->rcv_privilege == RCPRIV_SYSTEM)
			break;

		if ((rval->rcv_privilege == RCPRIV_BASIC) ||
		    (rval->rcv_flagaction & ~RCTL_LOCAL_ACTION_MASK) ==
		    (flagaction & ~RCTL_LOCAL_ACTION_MASK)) {
			/*
			 * The cursor points at the value being removed; move
			 * it to the successor and push that value to the
			 * control before deleting the entry.
			 */
			if (rctl->rc_cursor == rval) {
				rctl->rc_cursor = rval->rcv_next;
				rctl_val_list_reset(rctl->rc_cursor);
				RCTLOP_SET(rctl, p, &e, rctl_model_value(
				    rctl->rc_dict_entry, p,
				    rctl->rc_cursor->rcv_value));
			}
			(void) rctl_val_list_delete(&rctl->rc_values, rval);
		}

		rval = next;
	}

	/*
	 * Build the privileged value.  An infinite hard limit is stored as the
	 * control's native maximum and flagged RCTL_LOCAL_MAXIMAL.
	 */
	rval_priv->rcv_privilege = RCPRIV_PRIVILEGED;
	rval_priv->rcv_flagaction = flagaction;
	if (rlp64->rlim_max == RLIM64_INFINITY) {
		rval_priv->rcv_flagaction |= RCTL_LOCAL_MAXIMAL;
		max = rctl->rc_dict_entry->rcd_max_native;
	} else {
		max = rlp64->rlim_max;
	}
	rval_priv->rcv_value = max;
	rval_priv->rcv_action_signal = signal;
	rval_priv->rcv_action_recipient = NULL;
	rval_priv->rcv_action_recip_pid = -1;
	rval_priv->rcv_firing_time = 0;
	rval_priv->rcv_prev = rval_priv->rcv_next = NULL;

	/* Install it and make it the enforced value for now. */
	(void) rctl_val_list_insert(&rctl->rc_values, rval_priv);
	rctl->rc_cursor = rval_priv;
	rctl_val_list_reset(rctl->rc_cursor);
	RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
	    rctl->rc_cursor->rcv_value));

	/*
	 * A distinct soft limit is only needed when it is finite and lies
	 * strictly below the hard limit; it is addressed to p itself and, once
	 * inserted, becomes the enforced value in place of the privileged one.
	 */
	if (rlp64->rlim_cur != RLIM64_INFINITY && rlp64->rlim_cur < max) {
		rval_basic = rctl_gp_detach_val(ragp);

		rval_basic->rcv_privilege = RCPRIV_BASIC;
		rval_basic->rcv_value = rlp64->rlim_cur;
		rval_basic->rcv_flagaction = flagaction;
		rval_basic->rcv_action_signal = signal;
		rval_basic->rcv_action_recipient = p;
		rval_basic->rcv_action_recip_pid = p->p_pid;
		rval_basic->rcv_firing_time = 0;
		rval_basic->rcv_prev = rval_basic->rcv_next = NULL;

		(void) rctl_val_list_insert(&rctl->rc_values, rval_basic);
		rctl->rc_cursor = rval_basic;
		rctl_val_list_reset(rctl->rc_cursor);
		RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
		    rctl->rc_cursor->rcv_value));
	}

	ASSERT(rctl->rc_cursor != NULL);

	mutex_exit(&rset->rcs_lock);
	return (0);
}
2296 
2297 
2298 /*
2299  * rctl_hndl_t rctl_register(const char *, rctl_entity_t, int, rlim64_t,
2300  *   rlim64_t, rctl_ops_t *)
2301  *
2302  * Overview
2303  *   rctl_register() performs a look-up in the dictionary of rctls
2304  *   active on the system; if a rctl of that name is absent, an entry is
2305  *   made into the dictionary.  The rctl is returned with its reference
2306  *   count incremented by one.  If the rctl name already exists, we panic.
2307  *   (Were the resource control system to support dynamic loading and unloading,
2308  *   which it is structured for, duplicate registration should lead to load
2309  *   failure instead of panicking.)
2310  *
2311  *   Each registered rctl has a requirement that a RCPRIV_SYSTEM limit be
2312  *   defined.  This limit contains the highest possible value for this quantity
2313  *   on the system.  Furthermore, the registered control must provide infinite
2314  *   values for all applicable address space models supported by the operating
2315  *   system.  Attempts to set resource control values beyond the system limit
2316  *   will fail.
2317  *
2318  * Return values
2319  *   The rctl's ID.
2320  *
2321  * Caller's context
2322  *   Caller must be in a context suitable for KM_SLEEP allocations.
2323  */
2324 rctl_hndl_t
2325 rctl_register(
2326     const char *name,
2327     rctl_entity_t entity,
2328     int global_flags,
2329     rlim64_t max_native,
2330     rlim64_t max_ilp32,
2331     rctl_ops_t *ops)
2332 {
2333 	rctl_t *rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
2334 	rctl_val_t *rctl_val = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2335 	rctl_dict_entry_t *rctl_de = kmem_zalloc(sizeof (rctl_dict_entry_t),
2336 	    KM_SLEEP);
2337 	rctl_t *old_rctl;
2338 	rctl_hndl_t rhndl;
2339 	int localflags;
2340 
2341 	ASSERT(ops != NULL);
2342 
2343 	bzero(rctl, sizeof (rctl_t));
2344 	bzero(rctl_val, sizeof (rctl_val_t));
2345 
2346 	if (global_flags & RCTL_GLOBAL_DENY_NEVER)
2347 		localflags = RCTL_LOCAL_MAXIMAL;
2348 	else
2349 		localflags = RCTL_LOCAL_MAXIMAL | RCTL_LOCAL_DENY;
2350 
2351 	rctl_val->rcv_privilege = RCPRIV_SYSTEM;
2352 	rctl_val->rcv_value = max_native;
2353 	rctl_val->rcv_flagaction = localflags;
2354 	rctl_val->rcv_action_signal = 0;
2355 	rctl_val->rcv_action_recipient = NULL;
2356 	rctl_val->rcv_action_recip_pid = -1;
2357 	rctl_val->rcv_firing_time = 0;
2358 	rctl_val->rcv_next = NULL;
2359 	rctl_val->rcv_prev = NULL;
2360 
2361 	rctl_de->rcd_name = (char *)name;
2362 	rctl_de->rcd_default_value = rctl_val;
2363 	rctl_de->rcd_max_native = max_native;
2364 	rctl_de->rcd_max_ilp32 = max_ilp32;
2365 	rctl_de->rcd_entity = entity;
2366 	rctl_de->rcd_ops = ops;
2367 	rctl_de->rcd_flagaction = global_flags;
2368 
2369 	rctl->rc_dict_entry = rctl_de;
2370 	rctl->rc_values = rctl_val;
2371 
2372 	/*
2373 	 * 1.  Take global lock, validate nonexistence of name, get ID.
2374 	 */
2375 	mutex_enter(&rctl_dict_lock);
2376 
2377 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
2378 	    (mod_hash_val_t *)&rhndl) != MH_ERR_NOTFOUND)
2379 		panic("duplicate registration of rctl %s", name);
2380 
2381 	rhndl = rctl_de->rcd_id = rctl->rc_id =
2382 	    (rctl_hndl_t)id_alloc(rctl_ids);
2383 
2384 	/*
2385 	 * 2.  Insert name-entry pair in rctl_dict_by_name.
2386 	 */
2387 	if (mod_hash_insert(rctl_dict_by_name, (mod_hash_key_t)name,
2388 	    (mod_hash_val_t)rctl_de))
2389 		panic("unable to insert rctl dict entry for %s (%u)", name,
2390 		    (uint_t)rctl->rc_id);
2391 
2392 	/*
2393 	 * 3.  Insert ID-rctl_t * pair in rctl_dict.
2394 	 */
2395 	if (mod_hash_find(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2396 	    (mod_hash_val_t *)&old_rctl) != MH_ERR_NOTFOUND)
2397 		panic("duplicate rctl ID %u registered", rctl->rc_id);
2398 
2399 	if (mod_hash_insert(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2400 	    (mod_hash_val_t)rctl))
2401 		panic("unable to insert rctl %s/%u (%p)", name,
2402 		    (uint_t)rctl->rc_id, (void *)rctl);
2403 
2404 	/*
2405 	 * 3a. Insert rctl_dict_entry_t * in appropriate entity list.
2406 	 */
2407 
2408 	mutex_enter(&rctl_lists_lock);
2409 
2410 	switch (entity) {
2411 	case RCENTITY_ZONE:
2412 	case RCENTITY_PROJECT:
2413 	case RCENTITY_TASK:
2414 	case RCENTITY_PROCESS:
2415 		rctl_de->rcd_next = rctl_lists[entity];
2416 		rctl_lists[entity] = rctl_de;
2417 		break;
2418 	default:
2419 		panic("registering unknown rctl entity %d (%s)", entity,
2420 		    name);
2421 		break;
2422 	}
2423 
2424 	mutex_exit(&rctl_lists_lock);
2425 
2426 	/*
2427 	 * 4.  Drop lock.
2428 	 */
2429 	mutex_exit(&rctl_dict_lock);
2430 
2431 	return (rhndl);
2432 }
2433 
2434 /*
2435  * static int rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p,
2436  *    rctl_val_t *v)
2437  *
2438  * Overview
2439  *   rctl_global_action() takes, in according with the flags on the rctl_dict
2440  *   entry for the given control, the appropriate actions on the exceeded
2441  *   control value.  Additionally, rctl_global_action() updates the firing time
2442  *   on the exceeded value.
2443  *
2444  * Return values
2445  *   A bitmask reflecting the actions actually taken.
2446  *
2447  * Caller's context
2448  *   No restrictions on context.
2449  */
2450 /*ARGSUSED*/
2451 static int
2452 rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v)
2453 {
2454 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2455 	const char *pr, *en, *idstr;
2456 	id_t id;
2457 	enum {
2458 		SUFFIX_NONE,	/* id consumed directly */
2459 		SUFFIX_NUMERIC,	/* id consumed in suffix */
2460 		SUFFIX_STRING	/* idstr consumed in suffix */
2461 	} suffix = SUFFIX_NONE;
2462 	int ret = 0;
2463 
2464 	v->rcv_firing_time = gethrtime();
2465 
2466 	switch (v->rcv_privilege) {
2467 	case RCPRIV_BASIC:
2468 		pr = "basic";
2469 		break;
2470 	case RCPRIV_PRIVILEGED:
2471 		pr = "privileged";
2472 		break;
2473 	case RCPRIV_SYSTEM:
2474 		pr = "system";
2475 		break;
2476 	default:
2477 		pr = "unknown";
2478 		break;
2479 	}
2480 
2481 	switch (rde->rcd_entity) {
2482 	case RCENTITY_PROCESS:
2483 		en = "process";
2484 		id = p->p_pid;
2485 		suffix = SUFFIX_NONE;
2486 		break;
2487 	case RCENTITY_TASK:
2488 		en = "task";
2489 		id = p->p_task->tk_tkid;
2490 		suffix = SUFFIX_NUMERIC;
2491 		break;
2492 	case RCENTITY_PROJECT:
2493 		en = "project";
2494 		id = p->p_task->tk_proj->kpj_id;
2495 		suffix = SUFFIX_NUMERIC;
2496 		break;
2497 	case RCENTITY_ZONE:
2498 		en = "zone";
2499 		idstr = p->p_zone->zone_name;
2500 		suffix = SUFFIX_STRING;
2501 		break;
2502 	default:
2503 		en = "unknown entity associated with process";
2504 		id = p->p_pid;
2505 		suffix = SUFFIX_NONE;
2506 		break;
2507 	}
2508 
2509 	if (rde->rcd_flagaction & RCTL_GLOBAL_SYSLOG) {
2510 		switch (suffix) {
2511 		default:
2512 		case SUFFIX_NONE:
2513 			(void) strlog(0, 0, 0,
2514 			    rde->rcd_strlog_flags | log_global.lz_active,
2515 			    "%s rctl %s (value %llu) exceeded by %s %d.",
2516 			    pr, rde->rcd_name, v->rcv_value, en, id);
2517 			break;
2518 		case SUFFIX_NUMERIC:
2519 			(void) strlog(0, 0, 0,
2520 			    rde->rcd_strlog_flags | log_global.lz_active,
2521 			    "%s rctl %s (value %llu) exceeded by process %d"
2522 			    " in %s %d.",
2523 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2524 			    en, id);
2525 			break;
2526 		case SUFFIX_STRING:
2527 			(void) strlog(0, 0, 0,
2528 			    rde->rcd_strlog_flags | log_global.lz_active,
2529 			    "%s rctl %s (value %llu) exceeded by process %d"
2530 			    " in %s %s.",
2531 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2532 			    en, idstr);
2533 			break;
2534 		}
2535 	}
2536 
2537 	if (rde->rcd_flagaction & RCTL_GLOBAL_DENY_ALWAYS)
2538 		ret |= RCT_DENY;
2539 
2540 	return (ret);
2541 }
2542 
/*
 * static int rctl_local_action(rctl_t *r, rctl_set_t *rset, struct proc *p,
 *    rctl_val_t *v, uint_t safety)
 *
 * Overview
 *   rctl_local_action() carries out the actions recorded in the flagaction
 *   of the individual control value v:  delivering the value's signal
 *   (RCTL_LOCAL_SIGNAL) and/or denying the request (RCTL_LOCAL_DENY).
 *
 *   The safety argument limits what may be done:
 *     RCA_UNSAFE_ALL  no signal is sent; only the deny flag is evaluated.
 *     RCA_SAFE        p_lock and rcs_lock are dropped so that a sigqueue_t
 *                     can be allocated with KM_SLEEP, and the signal is
 *                     queued with siginfo.
 *     anything else   the signal is posted with sigtoproc(), without
 *                     siginfo.
 *
 * Return values
 *   A bitmask of RCT_SIGNAL, RCT_DENY and RCT_LK_ABANDONED.  When
 *   RCT_LK_ABANDONED is set, the function returns with p->p_lock held but
 *   without rset->rcs_lock; the caller must reacquire the set lock itself.
 *
 * Caller's context
 *   The function releases and reacquires p->p_lock and rset->rcs_lock, so
 *   both are presumably held on entry (the callers visible in this file take
 *   rcs_lock immediately before calling).
 */
static int
rctl_local_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v,
    uint_t safety)
{
	int ret = 0;
	sigqueue_t *sqp = NULL;
	rctl_dict_entry_t *rde = r->rc_dict_entry;
	int unobservable = (rde->rcd_flagaction & RCTL_GLOBAL_UNOBSERVABLE);

	/*
	 * Copy what we need out of v up front; the locks are dropped below and
	 * v is not dereferenced again after this point.
	 */
	proc_t *recipient = v->rcv_action_recipient;
	id_t recip_pid = v->rcv_action_recip_pid;
	int recip_signal = v->rcv_action_signal;
	uint_t flagaction = v->rcv_flagaction;

	/* No signalling at all in this mode; report only the deny flag. */
	if (safety == RCA_UNSAFE_ALL) {
		if (flagaction & RCTL_LOCAL_DENY) {
			ret |= RCT_DENY;
		}
		return (ret);
	}

	if (flagaction & RCTL_LOCAL_SIGNAL) {
		/*
		 * We can build a siginfo only in the case that it is
		 * safe for us to drop p_lock.  (For asynchronous
		 * checks this is currently not true.)
		 */
		if (safety == RCA_SAFE) {
			/* Drop both locks across the sleeping allocation. */
			mutex_exit(&rset->rcs_lock);
			mutex_exit(&p->p_lock);
			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
			mutex_enter(&p->p_lock);
			mutex_enter(&rset->rcs_lock);

			sqp->sq_info.si_signo = recip_signal;
			sqp->sq_info.si_code = SI_RCTL;
			sqp->sq_info.si_errno = 0;
			sqp->sq_info.si_entity = (int)rde->rcd_entity;
		}

		if (recipient == NULL || recipient == p) {
			/* The signal is for the violating process itself. */
			ret |= RCT_SIGNAL;

			if (sqp == NULL) {
				sigtoproc(p, NULL, recip_signal);
			} else if (p == curproc) {
				/*
				 * Then this is a synchronous test and we can
				 * direct the signal at the violating thread.
				 */
				sigaddqa(curproc, curthread, sqp);
			} else {
				sigaddqa(p, NULL, sqp);
			}
		} else if (!unobservable) {
			/*
			 * The signal is for another process.  Both of our
			 * locks are released before pidlock and the
			 * recipient's p_lock are taken.
			 */
			proc_t *rp;

			mutex_exit(&rset->rcs_lock);
			mutex_exit(&p->p_lock);

			mutex_enter(&pidlock);
			if ((rp = prfind(recip_pid)) == recipient) {
				/*
				 * Recipient process is still alive, but may not
				 * be in this task or project any longer.  In
				 * this case, the recipient's resource control
				 * set pertinent to this control will have
				 * changed--and we will not deliver the signal,
				 * as the recipient process is trying to tear
				 * itself off of its former set.
				 */
				mutex_enter(&rp->p_lock);
				mutex_exit(&pidlock);

				if (rctl_entity_obtain_rset(rde, rp) == rset) {
					ret |= RCT_SIGNAL;

					if (sqp == NULL)
						sigtoproc(rp, NULL,
						    recip_signal);
					else
						sigaddqa(rp, NULL, sqp);
				} else if (sqp) {
					/* Not delivered; release the siginfo. */
					kmem_free(sqp, sizeof (sigqueue_t));
				}
				mutex_exit(&rp->p_lock);
			} else {
				/* Recipient pid no longer maps to that proc. */
				mutex_exit(&pidlock);
				if (sqp)
					kmem_free(sqp, sizeof (sigqueue_t));
			}

			mutex_enter(&p->p_lock);
			/*
			 * Since we dropped p_lock, we may no longer be in the
			 * same task or project as we were at entry.  It is thus
			 * unsafe for us to reacquire the set lock at this
			 * point; callers of rctl_local_action() must handle
			 * this possibility.
			 */
			ret |= RCT_LK_ABANDONED;
		} else if (sqp) {
			/*
			 * Control is unobservable and the recipient is another
			 * process:  no signal is sent, so free the siginfo.
			 */
			kmem_free(sqp, sizeof (sigqueue_t));
		}
	}

	/* Deny only applies when the value targets the violating process. */
	if ((flagaction & RCTL_LOCAL_DENY) &&
	    (recipient == NULL || recipient == p)) {
		ret |= RCT_DENY;
	}

	return (ret);
}
2656 
2657 /*
2658  * int rctl_action(rctl_hndl_t, rctl_set_t *, struct proc *, uint_t)
2659  *
2660  * Overview
2661  *   Take the action associated with the enforced value (as defined by
2662  *   rctl_get_enforced_value()) being exceeded or encountered.  Possibly perform
2663  *   a restricted subset of the available actions, if circumstances dictate that
2664  *   we cannot safely allocate memory (for a sigqueue_t) or guarantee process
2665  *   persistence across the duration of the function (an asynchronous action).
2666  *
2667  * Return values
2668  *   Actions taken, according to the rctl_test bitmask.
2669  *
2670  * Caller's context
2671  *   Safe to acquire rcs_lock.
2672  */
2673 int
2674 rctl_action(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p, uint_t safety)
2675 {
2676 	return (rctl_action_entity(hndl, rset, p, NULL, safety));
2677 }
2678 
2679 int
2680 rctl_action_entity(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p,
2681     rctl_entity_p_t *e, uint_t safety)
2682 {
2683 	int ret = RCT_NONE;
2684 	rctl_t *lrctl;
2685 	rctl_entity_p_t e_tmp;
2686 
2687 rctl_action_acquire:
2688 	mutex_enter(&rset->rcs_lock);
2689 	if (rctl_set_find(rset, hndl, &lrctl) == -1) {
2690 		mutex_exit(&rset->rcs_lock);
2691 		return (ret);
2692 	}
2693 
2694 	if (e == NULL) {
2695 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2696 		    p, &e_tmp);
2697 		e = &e_tmp;
2698 	}
2699 
2700 	if ((ret & RCT_LK_ABANDONED) == 0) {
2701 		ret |= rctl_global_action(lrctl, rset, p, lrctl->rc_cursor);
2702 
2703 		RCTLOP_ACTION(lrctl, p, e);
2704 
2705 		ret |= rctl_local_action(lrctl, rset, p,
2706 		    lrctl->rc_cursor, safety);
2707 
2708 		if (ret & RCT_LK_ABANDONED)
2709 			goto rctl_action_acquire;
2710 	}
2711 
2712 	ret &= ~RCT_LK_ABANDONED;
2713 
2714 	if (!(ret & RCT_DENY) &&
2715 	    lrctl->rc_cursor->rcv_next != NULL) {
2716 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2717 
2718 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2719 		    p, lrctl->rc_cursor->rcv_value));
2720 
2721 	}
2722 	mutex_exit(&rset->rcs_lock);
2723 
2724 	return (ret);
2725 }
2726 
2727 /*
2728  * int rctl_test(rctl_hndl_t, rctl_set_t *, struct proc *, rctl_qty_t, uint_t)
2729  *
2730  * Overview
2731  *   Increment the resource associated with the given handle, returning zero if
2732  *   the incremented value does not exceed the threshold for the current limit
2733  *   on the resource.
2734  *
2735  * Return values
2736  *   Actions taken, according to the rctl_test bitmask.
2737  *
2738  * Caller's context
2739  *   p_lock held by caller.
2740  */
2741 /*ARGSUSED*/
2742 int
2743 rctl_test(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2744     rctl_qty_t incr, uint_t flags)
2745 {
2746 	return (rctl_test_entity(rhndl, rset, p, NULL, incr, flags));
2747 }
2748 
2749 int
2750 rctl_test_entity(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2751     rctl_entity_p_t *e, rctl_qty_t incr, uint_t flags)
2752 {
2753 	rctl_t *lrctl;
2754 	int ret = RCT_NONE;
2755 	rctl_entity_p_t e_tmp;
2756 	if (p == &p0) {
2757 		/*
2758 		 * We don't enforce rctls on the kernel itself.
2759 		 */
2760 		return (ret);
2761 	}
2762 
2763 rctl_test_acquire:
2764 	ASSERT(MUTEX_HELD(&p->p_lock));
2765 
2766 	mutex_enter(&rset->rcs_lock);
2767 
2768 	/*
2769 	 * Dereference from rctl_set.  We don't enforce newly loaded controls
2770 	 * that haven't been set on this entity (since the only valid value is
2771 	 * the infinite system value).
2772 	 */
2773 	if (rctl_set_find(rset, rhndl, &lrctl) == -1) {
2774 		mutex_exit(&rset->rcs_lock);
2775 		return (ret);
2776 	}
2777 
2778 	/*
2779 	 * This control is currently unenforced:  maximal value on control
2780 	 * supporting infinitely available resource.
2781 	 */
2782 	if ((lrctl->rc_dict_entry->rcd_flagaction & RCTL_GLOBAL_INFINITE) &&
2783 	    (lrctl->rc_cursor->rcv_flagaction & RCTL_LOCAL_MAXIMAL)) {
2784 
2785 		mutex_exit(&rset->rcs_lock);
2786 		return (ret);
2787 	}
2788 
2789 	/*
2790 	 * If we have been called by rctl_test, look up the entity pointer
2791 	 * from the proc pointer.
2792 	 */
2793 	if (e == NULL) {
2794 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2795 		    p, &e_tmp);
2796 		e = &e_tmp;
2797 	}
2798 
2799 	/*
2800 	 * Get enforced rctl value and current usage.  Test the increment
2801 	 * with the current usage against the enforced value--take action as
2802 	 * necessary.
2803 	 */
2804 	while (RCTLOP_TEST(lrctl, p, e, lrctl->rc_cursor, incr, flags)) {
2805 		if ((ret & RCT_LK_ABANDONED) == 0) {
2806 			ret |= rctl_global_action(lrctl, rset, p,
2807 			    lrctl->rc_cursor);
2808 
2809 			RCTLOP_ACTION(lrctl, p, e);
2810 
2811 			ret |= rctl_local_action(lrctl, rset, p,
2812 			    lrctl->rc_cursor, flags);
2813 
2814 			if (ret & RCT_LK_ABANDONED)
2815 				goto rctl_test_acquire;
2816 		}
2817 
2818 		ret &= ~RCT_LK_ABANDONED;
2819 
2820 		if ((ret & RCT_DENY) == RCT_DENY ||
2821 		    lrctl->rc_cursor->rcv_next == NULL) {
2822 			ret |= RCT_DENY;
2823 			break;
2824 		}
2825 
2826 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2827 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2828 		    p, lrctl->rc_cursor->rcv_value));
2829 	}
2830 
2831 	mutex_exit(&rset->rcs_lock);
2832 
2833 	return (ret);
2834 }
2835 
2836 /*
2837  * void rctl_init(void)
2838  *
2839  * Overview
2840  *   Initialize the rctl subsystem, including the primoridal rctls
2841  *   provided by the system.  New subsystem-specific rctls should _not_ be
2842  *   initialized here.  (Do it in your own file.)
2843  *
2844  * Return values
2845  *   None.
2846  *
2847  * Caller's context
2848  *   Safe for KM_SLEEP allocations.  Must be called prior to any process model
2849  *   initialization.
2850  */
2851 void
2852 rctl_init(void)
2853 {
2854 	rctl_cache = kmem_cache_create("rctl_cache", sizeof (rctl_t),
2855 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2856 	rctl_val_cache = kmem_cache_create("rctl_val_cache",
2857 	    sizeof (rctl_val_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
2858 
2859 	rctl_dict = mod_hash_create_extended("rctl_dict",
2860 	    rctl_dict_size, mod_hash_null_keydtor, rctl_dict_val_dtor,
2861 	    rctl_dict_hash_by_id, NULL, rctl_dict_id_cmp, KM_SLEEP);
2862 	rctl_dict_by_name = mod_hash_create_strhash(
2863 	    "rctl_handles_by_name", rctl_dict_size,
2864 	    mod_hash_null_valdtor);
2865 	rctl_ids = id_space_create("rctl_ids", 1, max_rctl_hndl);
2866 	bzero(rctl_lists, (RC_MAX_ENTITY + 1) * sizeof (rctl_dict_entry_t *));
2867 
2868 	rctlproc_init();
2869 }
2870 
2871 /*
2872  * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2873  *     int chargeproc)
2874  *
2875  * Increments the amount of locked memory on a project, and
2876  * zone. If proj is non-NULL the project must be held by the
2877  * caller; if it is NULL the proj and zone of proc_t p are used.
2878  * If chargeproc is non-zero, then the charged amount is cached
2879  * on p->p_locked_mem so that the charge can be migrated when a
2880  * process changes projects.
2881  *
2882  * Return values
2883  *    0 - success
2884  *    EAGAIN - attempting to increment locked memory is denied by one
2885  *      or more resource entities.
2886  */
2887 int
2888 rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2889     int chargeproc)
2890 {
2891 	kproject_t *projp;
2892 	zone_t *zonep;
2893 	rctl_entity_p_t e;
2894 	int ret = 0;
2895 
2896 	ASSERT(p != NULL);
2897 	ASSERT(MUTEX_HELD(&p->p_lock));
2898 	if (proj != NULL) {
2899 		projp = proj;
2900 		zonep = proj->kpj_zone;
2901 	} else {
2902 		projp = p->p_task->tk_proj;
2903 		zonep = p->p_zone;
2904 	}
2905 
2906 	mutex_enter(&zonep->zone_mem_lock);
2907 
2908 	e.rcep_p.proj = projp;
2909 	e.rcep_t = RCENTITY_PROJECT;
2910 
2911 	/* check for overflow */
2912 	if ((projp->kpj_data.kpd_locked_mem + inc) <
2913 	    projp->kpj_data.kpd_locked_mem) {
2914 		ret = EAGAIN;
2915 		goto out;
2916 	}
2917 	if (projp->kpj_data.kpd_locked_mem + inc >
2918 	    projp->kpj_data.kpd_locked_mem_ctl) {
2919 		if (rctl_test_entity(rc_project_locked_mem, projp->kpj_rctls,
2920 		    p, &e, inc, 0) & RCT_DENY) {
2921 			ret = EAGAIN;
2922 			goto out;
2923 		}
2924 	}
2925 	e.rcep_p.zone = zonep;
2926 	e.rcep_t = RCENTITY_ZONE;
2927 
2928 	/* Check for overflow */
2929 	if ((zonep->zone_locked_mem + inc) < zonep->zone_locked_mem) {
2930 		ret = EAGAIN;
2931 		goto out;
2932 	}
2933 	if (zonep->zone_locked_mem + inc > zonep->zone_locked_mem_ctl) {
2934 		if (rctl_test_entity(rc_zone_locked_mem, zonep->zone_rctls,
2935 		    p, &e, inc, 0) & RCT_DENY) {
2936 			ret = EAGAIN;
2937 			goto out;
2938 		}
2939 	}
2940 
2941 	zonep->zone_locked_mem += inc;
2942 	projp->kpj_data.kpd_locked_mem += inc;
2943 	if (chargeproc != 0) {
2944 		p->p_locked_mem += inc;
2945 	}
2946 out:
2947 	mutex_exit(&zonep->zone_mem_lock);
2948 	return (ret);
2949 }
2950 
2951 /*
2952  * rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2953  *     int creditproc)
2954  *
2955  * Decrements the amount of locked memory on a project and
2956  * zone.  If proj is non-NULL the project must be held by the
2957  * caller; if it is NULL the proj and zone of proc_t p are used.
2958  * If creditproc is non-zero, then the quantity of locked memory
2959  * is subtracted from p->p_locked_mem.
2960  *
2961  * Return values
2962  *   none
2963  */
2964 void
2965 rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2966     int creditproc)
2967 {
2968 	kproject_t *projp;
2969 	zone_t *zonep;
2970 
2971 	if (proj != NULL) {
2972 		projp = proj;
2973 		zonep = proj->kpj_zone;
2974 	} else {
2975 		ASSERT(p != NULL);
2976 		ASSERT(MUTEX_HELD(&p->p_lock));
2977 		projp = p->p_task->tk_proj;
2978 		zonep = p->p_zone;
2979 	}
2980 
2981 	mutex_enter(&zonep->zone_mem_lock);
2982 	zonep->zone_locked_mem -= inc;
2983 	projp->kpj_data.kpd_locked_mem -= inc;
2984 	if (creditproc != 0) {
2985 		ASSERT(p != NULL);
2986 		ASSERT(MUTEX_HELD(&p->p_lock));
2987 		p->p_locked_mem -= inc;
2988 	}
2989 	mutex_exit(&zonep->zone_mem_lock);
2990 }
2991 
2992 /*
2993  * rctl_incr_swap(proc_t *, zone_t *, size_t)
2994  *
2995  * Overview
2996  *   Increments the swap charge on the specified zone.
2997  *
2998  * Return values
2999  *   0 on success.  EAGAIN if swap increment fails due an rctl value
3000  *   on the zone.
3001  *
3002  * Callers context
3003  *   p_lock held on specified proc.
3004  *   swap must be even multiple of PAGESIZE
3005  */
3006 int
3007 rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap)
3008 {
3009 	rctl_entity_p_t e;
3010 
3011 	ASSERT(MUTEX_HELD(&proc->p_lock));
3012 	ASSERT((swap & PAGEOFFSET) == 0);
3013 	e.rcep_p.zone = zone;
3014 	e.rcep_t = RCENTITY_ZONE;
3015 
3016 	mutex_enter(&zone->zone_mem_lock);
3017 
3018 	/* Check for overflow */
3019 	if ((zone->zone_max_swap + swap) < zone->zone_max_swap) {
3020 		mutex_exit(&zone->zone_mem_lock);
3021 		return (EAGAIN);
3022 	}
3023 	if ((zone->zone_max_swap + swap) >
3024 	    zone->zone_max_swap_ctl) {
3025 
3026 		if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls,
3027 		    proc, &e, swap, 0) & RCT_DENY) {
3028 			mutex_exit(&zone->zone_mem_lock);
3029 			return (EAGAIN);
3030 		}
3031 	}
3032 	zone->zone_max_swap += swap;
3033 	mutex_exit(&zone->zone_mem_lock);
3034 	return (0);
3035 }
3036 
3037 /*
3038  * rctl_decr_swap(zone_t *, size_t)
3039  *
3040  * Overview
3041  *   Decrements the swap charge on the specified zone.
3042  *
3043  * Return values
3044  *   None
3045  *
3046  * Callers context
3047  *   swap must be even multiple of PAGESIZE
3048  */
3049 void
3050 rctl_decr_swap(zone_t *zone, size_t swap)
3051 {
3052 	ASSERT((swap & PAGEOFFSET) == 0);
3053 	mutex_enter(&zone->zone_mem_lock);
3054 	ASSERT(zone->zone_max_swap >= swap);
3055 	zone->zone_max_swap -= swap;
3056 	mutex_exit(&zone->zone_mem_lock);
3057 }
3058 
3059 /*
3060  * rctl_incr_lofi(proc_t *, zone_t *, size_t)
3061  *
3062  * Overview
3063  *   Increments the number of lofi devices for the zone.
3064  *
3065  * Return values
3066  *   0 on success.  EAGAIN if increment fails due an rctl value
3067  *   on the zone.
3068  *
3069  * Callers context
3070  *   p_lock held on specified proc.
3071  */
3072 int
3073 rctl_incr_lofi(proc_t *proc, zone_t *zone, size_t incr)
3074 {
3075 	rctl_entity_p_t e;
3076 
3077 	ASSERT(MUTEX_HELD(&proc->p_lock));
3078 	ASSERT(incr > 0);
3079 
3080 	e.rcep_p.zone = zone;
3081 	e.rcep_t = RCENTITY_ZONE;
3082 
3083 	mutex_enter(&zone->zone_rctl_lock);
3084 
3085 	/* Check for overflow */
3086 	if ((zone->zone_max_lofi + incr) < zone->zone_max_lofi) {
3087 		mutex_exit(&zone->zone_rctl_lock);
3088 		return (EAGAIN);
3089 	}
3090 	if ((zone->zone_max_lofi + incr) > zone->zone_max_lofi_ctl) {
3091 		if (rctl_test_entity(rc_zone_max_lofi, zone->zone_rctls,
3092 		    proc, &e, incr, 0) & RCT_DENY) {
3093 			mutex_exit(&zone->zone_rctl_lock);
3094 			return (EAGAIN);
3095 		}
3096 	}
3097 	zone->zone_max_lofi += incr;
3098 	mutex_exit(&zone->zone_rctl_lock);
3099 	return (0);
3100 }
3101 
3102 /*
3103  * rctl_decr_lofi(zone_t *, size_t)
3104  *
3105  * Overview
3106  *   Decrements the number of lofi devices for the zone.
3107  */
3108 void
3109 rctl_decr_lofi(zone_t *zone, size_t decr)
3110 {
3111 	mutex_enter(&zone->zone_rctl_lock);
3112 	ASSERT(zone->zone_max_lofi >= decr);
3113 	zone->zone_max_lofi -= decr;
3114 	mutex_exit(&zone->zone_rctl_lock);
3115 }
3116 
3117 /*
3118  * Create resource kstat
3119  */
3120 static kstat_t *
3121 rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class,
3122     uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid)
3123 {
3124 	kstat_t *ksp = NULL;
3125 	char name[KSTAT_STRLEN];
3126 
3127 	(void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance);
3128 
3129 	if ((ksp = kstat_create_zone("caps", ks_zoneid,
3130 	    name, ks_class, ks_type,
3131 	    ks_ndata, ks_flags, ks_zoneid)) != NULL) {
3132 		if (ks_zoneid != GLOBAL_ZONEID)
3133 			kstat_zone_add(ksp, GLOBAL_ZONEID);
3134 	}
3135 	return (ksp);
3136 }
3137 
3138 /*
3139  * Create zone-specific resource kstat
3140  */
3141 kstat_t *
3142 rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type,
3143     uint_t ks_ndata, uchar_t ks_flags)
3144 {
3145 	char name[KSTAT_STRLEN];
3146 
3147 	(void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name);
3148 
3149 	return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps",
3150 	    ks_type, ks_ndata, ks_flags, zone->zone_id));
3151 }
3152 
3153 /*
3154  * Create project-specific resource kstat
3155  */
3156 kstat_t *
3157 rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type,
3158     uint_t ks_ndata, uchar_t ks_flags)
3159 {
3160 	char name[KSTAT_STRLEN];
3161 
3162 	(void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name);
3163 
3164 	return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps",
3165 	    ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid));
3166 }
3167 
3168 /*
3169  * Create task-specific resource kstat
3170  */
3171 kstat_t *
3172 rctl_kstat_create_task(task_t *tk, char *ks_name, uchar_t ks_type,
3173     uint_t ks_ndata, uchar_t ks_flags)
3174 {
3175 	char name[KSTAT_STRLEN];
3176 
3177 	(void) snprintf(name, KSTAT_STRLEN, "%s_task", ks_name);
3178 
3179 	return (rctl_kstat_create_common(name, tk->tk_tkid, "task_caps",
3180 	    ks_type, ks_ndata, ks_flags, tk->tk_proj->kpj_zoneid));
3181 }
3182