xref: /titanic_41/usr/src/uts/common/os/rctl.c (revision c0c79a3f09914f35651895ffc111883455b7f62d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/atomic.h>
29 #include <sys/cmn_err.h>
30 #include <sys/id_space.h>
31 #include <sys/kmem.h>
32 #include <sys/kstat.h>
33 #include <sys/log.h>
34 #include <sys/modctl.h>
35 #include <sys/modhash.h>
36 #include <sys/mutex.h>
37 #include <sys/proc.h>
38 #include <sys/procset.h>
39 #include <sys/project.h>
40 #include <sys/resource.h>
41 #include <sys/rctl.h>
42 #include <sys/siginfo.h>
43 #include <sys/strlog.h>
44 #include <sys/systm.h>
45 #include <sys/task.h>
46 #include <sys/types.h>
47 #include <sys/policy.h>
48 #include <sys/zone.h>
49 
50 /*
51  * Resource controls (rctls)
52  *
53  *   The rctl subsystem provides a mechanism for kernel components to
54  *   register their individual resource controls with the system as a whole,
55  *   such that those controls can subscribe to specific actions while being
56  *   associated with the various process-model entities provided by the kernel:
57  *   the process, the task, the project, and the zone.  (In principle, only
58  *   minor modifications would be required to connect the resource control
59  *   functionality to non-process-model entities associated with the system.)
60  *
61  *   Subsystems register their rctls via rctl_register().  Subsystems
62  *   also wishing to provide additional limits on a given rctl can modify
63  *   them once they have the rctl handle.  Each subsystem should store the
64  *   handle to their rctl for direct access.
65  *
66  *   A primary dictionary, rctl_dict, contains a hash of id to the default
67  *   control definition for each controlled resource-entity pair on the system.
68  *   A secondary dictionary, rctl_dict_by_name, contains a hash of name to
69  *   resource control handles.  The resource control handles are distributed by
70  *   the rctl_ids ID space.  The handles are private and not to be
71  *   advertised to userland; all userland interactions are via the rctl
72  *   names.
73  *
74  *   Entities inherit their rctls from their predecessor.  Since projects have
75  *   no ancestor, they inherit their rctls from the rctl dict for project
76  *   rctls.  It is expected that project controls will be set to their
77  *   appropriate values shortly after project creation, presumably from a
78  *   policy source such as the project database.
79  *
80  * Data structures
81  *   The rctl_set_t attached to each of the process model entities is a simple
82  *   hash table keyed on the rctl handle assigned at registration.  The entries
83  *   in the hash table are rctl_t's, whose relationship with the active control
84  *   values on that resource and with the global state of the resource we
85  *   illustrate below:
86  *
87  *   rctl_dict[key] --> rctl_dict_entry
88  *			   ^
89  *			   |
90  *			+--+---+
91  *   rctl_set[key] ---> | rctl | --> value <-> value <-> system value --> NULL
92  *			+--+---+		 ^
93  *			   |			 |
94  *			   +------- cursor ------+
95  *
96  *   That is, the rctl contains a back pointer to the global resource control
97  *   state for this resource, which is also available in the rctl_dict hash
98  *   table mentioned earlier.  The rctl contains two pointers to resource
99  *   control values:  one, values, indicates the entire sequence of control
100  *   values; the other, cursor, indicates the currently active control
101  *   value--the next value to be enforced.  The value list itself is an open,
102  *   doubly-linked list, the last non-NULL member of which is the system value
103  *   for that resource (being the theoretical/conventional maximum allowable
104  *   value for the resource on this OS instance).
105  *
106  * Ops Vector
107  *   Subsystems publishing rctls need not provide instances of all of the
108  *   functions specified by the ops vector.  In particular, if general
109  *   rctl_*() entry points are not being called, certain functions can be
110  *   omitted.  These align as follows:
111  *
112  *   rctl_set()
113  *     You may wish to provide a set callback if locking circumstances prevent
114  *     it or if the performance cost of requesting the enforced value from the
115  *     resource control is prohibitively expensive.  For instance, the currently
116  *     enforced file size limit is stored on the process in the p_fsz_ctl to
117  *     maintain read()/write() performance.
118  *
119  *   rctl_test()
120  *     You must provide a test callback if you are using the rctl_test()
121  *     interface.  An action callback is optional.
122  *
123  *   rctl_action()
124  *     You may wish to provide an action callback.
125  *
126  * Registration
127  *   New resource controls can be added to a running instance by loaded modules
128  *   via registration.  (The current implementation does not support unloadable
129  *   modules; this functionality can be added if needed, via an
130  *   activation/deactivation interface involving the manipulation of the
131  *   ops vector for the resource control(s) needing to support unloading.)
132  *
133  * Control value ordering
134  *   Because the rctl_val chain on each rctl must be navigable in a
135  *   deterministic way, we have to define an ordering on the rctl_val_t's.  The
136  *   defined order is (flags & [maximal], value, flags & [deny-action],
137  *   privilege).
138  *
139  * Locking
140  *   rctl_dict_lock must be acquired prior to rctl_lists_lock.  Since
141  *   rctl_dict_lock or rctl_lists_lock can be called at the enforcement point
142  *   of any subsystem, holding subsystem locks, it is at all times inappropriate
143  *   to call kmem_alloc(., KM_SLEEP) while holding either of these locks.
144  *   Traversing any of the various resource control entity lists requires
145  *   holding rctl_lists_lock.
146  *
147  *   Each individual resource control set associated with an entity must have
148  *   its rcs_lock held for the duration of any operations that would add
149  *   resource controls or control values to the set.
150  *
151  *   The locking subsequence of interest is: p_lock, rctl_dict_lock,
152  *   rctl_lists_lock, entity->rcs_lock.
153  *
154  * The projects(4) database and project entity resource controls
155  *   A special case is made for RCENTITY_PROJECT values set through the
156  *   setproject(3PROJECT) interface.  setproject() makes use of a private
157  *   interface, setprojrctl(), which passes through an array of resource control
158  *   blocks that need to be set while holding the entity->rcs_lock.  This
159  *   ensures that the act of modifying a project's resource controls is
160  *   "atomic" within the kernel.
161  *
162  *   Within the rctl sub-system, we provide two interfaces that are only used by
163  *   the setprojrctl() code path - rctl_local_insert_all() and
164  *   rctl_local_replace_all().  rctl_local_insert_all() will ensure that the
165  *   resource values specified in *new_values are applied.
166  *   rctl_local_replace_all() will purge the current rctl->rc_projdb and
167  *   rctl->rc_values entries, and apply the *new_values.
168  *
169  *   These functions modify not only the linked list of active resource controls
170  *   (rctl->rc_values), but also a "cached" linked list (rctl->rc_projdb) of
171  *   values set through these interfaces.  To clarify:
172  *
173  *      rctl->rc_values - a linked list of rctl_val_t.  These are the active
174  *      resource values associated with this rctl, and may have been set by
175  *      setrctl() - via prctl(1M), or by setprojrctl() - via
176  *      setproject(3PROJECT).
177  *
178  *      rctl->rc_projdb - a linked list of rctl_val_t.  These reflect the
179  *      resource values set by the setprojrctl() code path.  rc_projdb is not
180  *      referenced by any other component of the rctl sub-system.
181  *
182  *   As various locks are held when calling these functions, we ensure that all
183  *   the possible memory allocations are performed prior to calling the
184  *   function.  *alloc_values is a linked list of uninitialized rctl_val_t,
185  *   which may be used to duplicate a new resource control value (passed in as
186  *   one of the members of the *new_values linked list), in order to populate
187  *   rctl->rc_values.
188  */
189 
/*
 * Global state of the rctl subsystem.  rctl_dict_size and rctl_set_size are
 * used as hash moduli by rctl_dict_hash_by_id() and rctl_set_insert()
 * respectively.
 */
id_t max_rctl_hndl = 32768;	/* presumably the bound of rctl_ids; confirm */
int rctl_dict_size = 64;	/* modulus for the id-keyed dictionary hash */
int rctl_set_size = 8;		/* number of buckets in each rctl_set_t */
kmutex_t rctl_dict_lock;	/* held around rctl_dict_by_name lookups */
mod_hash_t *rctl_dict;		/* id -> dictionary entry */
mod_hash_t *rctl_dict_by_name;	/* name -> dictionary entry */
id_space_t *rctl_ids;		/* ID space from which handles are drawn */
kmem_cache_t *rctl_cache;	/* kmem cache for rctl structures */
kmem_cache_t *rctl_val_cache;	/* kmem cache for rctl values */

kmutex_t rctl_lists_lock;	/* held while traversing rctl_lists[] */
/* One list of dictionary entries per entity type, linked via rcd_next. */
rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1];
202 
203 /*
204  * Default resource control operations and ops vector
205  *   To be used if the particular rcontrol has no specific actions defined, or
206  *   if the subsystem providing the control is quiescing (in preparation for
207  *   unloading, presumably.)
208  *
209  *   Resource controls with callbacks should fill the unused operations with the
210  *   appropriate default impotent callback.
211  */
/*ARGSUSED*/
void
rcop_no_action(struct rctl *r, struct proc *p, rctl_entity_p_t *e)
{
	/* Default action callback: intentionally empty, arguments unused. */
}
217 
/*ARGSUSED*/
rctl_qty_t
rcop_no_usage(struct rctl *r, struct proc *p)
{
	/* Default usage callback: always reports a usage of zero. */
	return (0);
}
224 
/*ARGSUSED*/
int
rcop_no_set(struct rctl *r, struct proc *p, rctl_entity_p_t *e, rctl_qty_t l)
{
	/* Default set callback: the new value l is ignored; returns 0. */
	return (0);
}
231 
/*ARGSUSED*/
int
rcop_no_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
    struct rctl_val *rv, rctl_qty_t i, uint_t f)
{
	/*
	 * Default test callback: returns 0 whatever the arguments.  Compare
	 * rcop_absolute_test(), which returns non-zero when i exceeds the
	 * control value; 0 presumably means "limit not exceeded".
	 */
	return (0);
}
239 
/*
 * Ops vector built entirely from the do-nothing callbacks above.  The slot
 * labels below are inferred from the callback names, not from the
 * rctl_ops_t declaration, which is not visible here.
 */
rctl_ops_t rctl_default_ops = {
	rcop_no_action,		/* action: no-op */
	rcop_no_usage,		/* usage: always 0 */
	rcop_no_set,		/* set: value ignored, returns 0 */
	rcop_no_test		/* test: always returns 0 */
};
246 
247 /*
248  * Default "absolute" resource control operation and ops vector
249  *   Useful if there is no usage associated with the
250  *   resource control.
251  */
252 /*ARGSUSED*/
253 int
254 rcop_absolute_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
255     struct rctl_val *rv, rctl_qty_t i, uint_t f)
256 {
257 	return (i > rv->rcv_value);
258 }
259 
/*
 * Ops vector identical to rctl_default_ops except for the test slot, which
 * compares the tested quantity directly against the control value.  Slot
 * labels are inferred from the callback names.
 */
rctl_ops_t rctl_absolute_ops = {
	rcop_no_action,		/* action: no-op */
	rcop_no_usage,		/* usage: always 0 */
	rcop_no_set,		/* set: value ignored, returns 0 */
	rcop_absolute_test	/* test: non-zero when i > rcv_value */
};
266 
267 /*ARGSUSED*/
268 static uint_t
269 rctl_dict_hash_by_id(void *hash_data, mod_hash_key_t key)
270 {
271 	return ((uint_t)(uintptr_t)key % rctl_dict_size);
272 }
273 
274 static int
275 rctl_dict_id_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
276 {
277 	uint_t u1 = (uint_t)(uintptr_t)key1;
278 	uint_t u2 = (uint_t)(uintptr_t)key2;
279 
280 	if (u1 > u2)
281 		return (1);
282 
283 	if (u1 == u2)
284 		return (0);
285 
286 	return (-1);
287 }
288 
289 static void
290 rctl_dict_val_dtor(mod_hash_val_t val)
291 {
292 	rctl_dict_entry_t *kr = (rctl_dict_entry_t *)val;
293 
294 	kmem_free(kr, sizeof (rctl_dict_entry_t));
295 }
296 
297 /*
298  * size_t rctl_build_name_buf()
299  *
300  * Overview
301  *   rctl_build_name_buf() walks all active resource controls in the dictionary,
302  *   building a buffer of continguous NUL-terminated strings.
303  *
304  * Return values
305  *   The size of the buffer is returned, the passed pointer's contents are
306  *   modified to that of the location of the buffer.
307  *
308  * Caller's context
309  *   Caller must be in a context suitable for KM_SLEEP allocations.
310  */
311 size_t
312 rctl_build_name_buf(char **rbufp)
313 {
314 	size_t req_size, cpy_size;
315 	char *rbufloc;
316 	int i;
317 
318 rctl_rebuild_name_buf:
319 	req_size = cpy_size = 0;
320 
321 	/*
322 	 * Calculate needed buffer length.
323 	 */
324 	mutex_enter(&rctl_lists_lock);
325 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
326 		rctl_dict_entry_t *rde;
327 
328 		for (rde = rctl_lists[i];
329 		    rde != NULL;
330 		    rde = rde->rcd_next)
331 			req_size += strlen(rde->rcd_name) + 1;
332 	}
333 	mutex_exit(&rctl_lists_lock);
334 
335 	rbufloc = *rbufp = kmem_alloc(req_size, KM_SLEEP);
336 
337 	/*
338 	 * Copy rctl names into our buffer.  If the copy length exceeds the
339 	 * allocate length (due to registration changes), stop copying, free the
340 	 * buffer, and start again.
341 	 */
342 	mutex_enter(&rctl_lists_lock);
343 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
344 		rctl_dict_entry_t *rde;
345 
346 		for (rde = rctl_lists[i];
347 		    rde != NULL;
348 		    rde = rde->rcd_next) {
349 			size_t length = strlen(rde->rcd_name) + 1;
350 
351 			cpy_size += length;
352 
353 			if (cpy_size > req_size) {
354 				kmem_free(*rbufp, req_size);
355 				mutex_exit(&rctl_lists_lock);
356 				goto rctl_rebuild_name_buf;
357 			}
358 
359 			bcopy(rde->rcd_name, rbufloc, length);
360 			rbufloc += length;
361 		}
362 	}
363 	mutex_exit(&rctl_lists_lock);
364 
365 	return (req_size);
366 }
367 
368 /*
369  * rctl_dict_entry_t *rctl_dict_lookup(const char *)
370  *
371  * Overview
372  *   rctl_dict_lookup() returns the resource control dictionary entry for the
373  *   named resource control.
374  *
375  * Return values
376  *   A pointer to the appropriate resource control dictionary entry, or NULL if
377  *   no such named entry exists.
378  *
379  * Caller's context
380  *   Caller must not be holding rctl_dict_lock.
381  */
382 rctl_dict_entry_t *
383 rctl_dict_lookup(const char *name)
384 {
385 	rctl_dict_entry_t *rde;
386 
387 	mutex_enter(&rctl_dict_lock);
388 
389 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
390 	    (mod_hash_val_t *)&rde) == MH_ERR_NOTFOUND) {
391 		mutex_exit(&rctl_dict_lock);
392 		return (NULL);
393 	}
394 
395 	mutex_exit(&rctl_dict_lock);
396 
397 	return (rde);
398 }
399 
400 /*
401  * rctl_hndl_t rctl_hndl_lookup(const char *)
402  *
403  * Overview
404  *   rctl_hndl_lookup() returns the resource control id (the "handle") for the
405  *   named resource control.
406  *
407  * Return values
408  *   The appropriate id, or -1 if no such named entry exists.
409  *
410  * Caller's context
411  *   Caller must not be holding rctl_dict_lock.
412  */
413 rctl_hndl_t
414 rctl_hndl_lookup(const char *name)
415 {
416 	rctl_dict_entry_t *rde;
417 
418 	if ((rde = rctl_dict_lookup(name)) == NULL)
419 		return (-1);
420 
421 	return (rde->rcd_id);
422 }
423 
424 /*
425  * rctl_dict_entry_t * rctl_dict_lookup_hndl(rctl_hndl_t)
426  *
427  * Overview
428  *   rctl_dict_lookup_hndl() completes the public lookup functions, by returning
429  *   the resource control dictionary entry matching a given resource control id.
430  *
431  * Return values
432  *   A pointer to the matching resource control dictionary entry, or NULL if the
433  *   id does not match any existing entries.
434  *
435  * Caller's context
436  *   Caller must not be holding rctl_lists_lock.
437  */
438 rctl_dict_entry_t *
439 rctl_dict_lookup_hndl(rctl_hndl_t hndl)
440 {
441 	uint_t i;
442 
443 	mutex_enter(&rctl_lists_lock);
444 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
445 		rctl_dict_entry_t *rde;
446 
447 		for (rde = rctl_lists[i];
448 		    rde != NULL;
449 		    rde = rde->rcd_next)
450 			if (rde->rcd_id == hndl) {
451 				mutex_exit(&rctl_lists_lock);
452 				return (rde);
453 			}
454 	}
455 	mutex_exit(&rctl_lists_lock);
456 
457 	return (NULL);
458 }
459 
460 /*
461  * void rctl_add_default_limit(const char *name, rctl_qty_t value,
462  *     rctl_priv_t privilege, uint_t action)
463  *
464  * Overview
465  *   Create a default limit with specified value, privilege, and action.
466  *
467  * Return value
468  *   No value returned.
469  */
470 void
471 rctl_add_default_limit(const char *name, rctl_qty_t value,
472     rctl_priv_t privilege, uint_t action)
473 {
474 	rctl_val_t *dval;
475 	rctl_dict_entry_t *rde;
476 
477 	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
478 	bzero(dval, sizeof (rctl_val_t));
479 	dval->rcv_value = value;
480 	dval->rcv_privilege = privilege;
481 	dval->rcv_flagaction = action;
482 	dval->rcv_action_recip_pid = -1;
483 
484 	rde = rctl_dict_lookup(name);
485 	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
486 }
487 
488 /*
489  * void rctl_add_legacy_limit(const char *name, const char *mname,
490  *     const char *lname, rctl_qty_t dflt)
491  *
492  * Overview
493  *   Create a default privileged limit, using the value obtained from
494  *   /etc/system if it exists and is greater than the specified default
495  *   value.  Exists primarily for System V IPC.
496  *
497  * Return value
498  *   No value returned.
499  */
500 void
501 rctl_add_legacy_limit(const char *name, const char *mname, const char *lname,
502     rctl_qty_t dflt, rctl_qty_t max)
503 {
504 	rctl_qty_t qty;
505 
506 	if (!mod_sysvar(mname, lname, &qty) || (qty < dflt))
507 		qty = dflt;
508 
509 	if (qty > max)
510 		qty = max;
511 
512 	rctl_add_default_limit(name, qty, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
513 }
514 
515 static rctl_set_t *
516 rctl_entity_obtain_rset(rctl_dict_entry_t *rcd, struct proc *p)
517 {
518 	rctl_set_t *rset = NULL;
519 
520 	if (rcd == NULL)
521 		return (NULL);
522 
523 	switch (rcd->rcd_entity) {
524 	case RCENTITY_PROCESS:
525 		rset = p->p_rctls;
526 		break;
527 	case RCENTITY_TASK:
528 		ASSERT(MUTEX_HELD(&p->p_lock));
529 		if (p->p_task != NULL)
530 			rset = p->p_task->tk_rctls;
531 		break;
532 	case RCENTITY_PROJECT:
533 		ASSERT(MUTEX_HELD(&p->p_lock));
534 		if (p->p_task != NULL &&
535 		    p->p_task->tk_proj != NULL)
536 			rset = p->p_task->tk_proj->kpj_rctls;
537 		break;
538 	case RCENTITY_ZONE:
539 		ASSERT(MUTEX_HELD(&p->p_lock));
540 		if (p->p_zone != NULL)
541 			rset = p->p_zone->zone_rctls;
542 		break;
543 	default:
544 		panic("unknown rctl entity type %d seen", rcd->rcd_entity);
545 		break;
546 	}
547 
548 	return (rset);
549 }
550 
551 static void
552 rctl_entity_obtain_entity_p(rctl_entity_t entity, struct proc *p,
553     rctl_entity_p_t *e)
554 {
555 	e->rcep_p.proc = NULL;
556 	e->rcep_t = entity;
557 
558 	switch (entity) {
559 	case RCENTITY_PROCESS:
560 		e->rcep_p.proc = p;
561 		break;
562 	case RCENTITY_TASK:
563 		ASSERT(MUTEX_HELD(&p->p_lock));
564 		if (p->p_task != NULL)
565 			e->rcep_p.task = p->p_task;
566 		break;
567 	case RCENTITY_PROJECT:
568 		ASSERT(MUTEX_HELD(&p->p_lock));
569 		if (p->p_task != NULL &&
570 		    p->p_task->tk_proj != NULL)
571 			e->rcep_p.proj = p->p_task->tk_proj;
572 		break;
573 	case RCENTITY_ZONE:
574 		ASSERT(MUTEX_HELD(&p->p_lock));
575 		if (p->p_zone != NULL)
576 			e->rcep_p.zone = p->p_zone;
577 		break;
578 	default:
579 		panic("unknown rctl entity type %d seen", entity);
580 		break;
581 	}
582 }
583 
584 static void
585 rctl_gp_alloc(rctl_alloc_gp_t *rcgp)
586 {
587 	uint_t i;
588 
589 	if (rcgp->rcag_nctls > 0) {
590 		rctl_t *prev = kmem_cache_alloc(rctl_cache, KM_SLEEP);
591 		rctl_t *rctl = prev;
592 
593 		rcgp->rcag_ctls = prev;
594 
595 		for (i = 1; i < rcgp->rcag_nctls; i++) {
596 			rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
597 			prev->rc_next = rctl;
598 			prev = rctl;
599 		}
600 
601 		rctl->rc_next = NULL;
602 	}
603 
604 	if (rcgp->rcag_nvals > 0) {
605 		rctl_val_t *prev = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
606 		rctl_val_t *rval = prev;
607 
608 		rcgp->rcag_vals = prev;
609 
610 		for (i = 1; i < rcgp->rcag_nvals; i++) {
611 			rval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
612 			prev->rcv_next = rval;
613 			prev = rval;
614 		}
615 
616 		rval->rcv_next = NULL;
617 	}
618 
619 }
620 
621 static rctl_val_t *
622 rctl_gp_detach_val(rctl_alloc_gp_t *rcgp)
623 {
624 	rctl_val_t *rval = rcgp->rcag_vals;
625 
626 	ASSERT(rcgp->rcag_nvals > 0);
627 	rcgp->rcag_nvals--;
628 	rcgp->rcag_vals = rval->rcv_next;
629 
630 	rval->rcv_next = NULL;
631 
632 	return (rval);
633 }
634 
635 static rctl_t *
636 rctl_gp_detach_ctl(rctl_alloc_gp_t *rcgp)
637 {
638 	rctl_t *rctl = rcgp->rcag_ctls;
639 
640 	ASSERT(rcgp->rcag_nctls > 0);
641 	rcgp->rcag_nctls--;
642 	rcgp->rcag_ctls = rctl->rc_next;
643 
644 	rctl->rc_next = NULL;
645 
646 	return (rctl);
647 
648 }
649 
650 static void
651 rctl_gp_free(rctl_alloc_gp_t *rcgp)
652 {
653 	rctl_val_t *rval = rcgp->rcag_vals;
654 	rctl_t *rctl = rcgp->rcag_ctls;
655 
656 	while (rval != NULL) {
657 		rctl_val_t *next = rval->rcv_next;
658 
659 		kmem_cache_free(rctl_val_cache, rval);
660 		rval = next;
661 	}
662 
663 	while (rctl != NULL) {
664 		rctl_t *next = rctl->rc_next;
665 
666 		kmem_cache_free(rctl_cache, rctl);
667 		rctl = next;
668 	}
669 }
670 
671 /*
672  * void rctl_prealloc_destroy(rctl_alloc_gp_t *)
673  *
674  * Overview
675  *   Release all unused memory allocated via one of the "prealloc" functions:
676  *   rctl_set_init_prealloc, rctl_set_dup_prealloc, or rctl_rlimit_set_prealloc.
677  *
678  * Return values
679  *   None.
680  *
681  * Caller's context
682  *   No restrictions on context.
683  */
684 void
685 rctl_prealloc_destroy(rctl_alloc_gp_t *gp)
686 {
687 	rctl_gp_free(gp);
688 	kmem_free(gp, sizeof (rctl_alloc_gp_t));
689 }
690 
691 /*
692  * int rctl_val_cmp(rctl_val_t *, rctl_val_t *, int)
693  *
694  * Overview
695  *   This function defines an ordering to rctl_val_t's in order to allow
696  *   for correct placement in value lists. When the imprecise flag is set,
697  *   the action recipient is ignored. This is to facilitate insert,
698  *   delete, and replace operations by rctlsys.
699  *
700  * Return values
701  *   0 if the val_t's are are considered identical
702  *   -1 if a is ordered lower than b
703  *   1 if a is lowered higher than b
704  *
705  * Caller's context
706  *   No restrictions on context.
707  */
708 int
709 rctl_val_cmp(rctl_val_t *a, rctl_val_t *b, int imprecise)
710 {
711 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) <
712 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
713 		return (-1);
714 
715 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) >
716 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
717 		return (1);
718 
719 	if (a->rcv_value < b->rcv_value)
720 		return (-1);
721 
722 	if (a->rcv_value > b->rcv_value)
723 		return (1);
724 
725 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) <
726 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
727 		return (-1);
728 
729 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) >
730 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
731 		return (1);
732 
733 	if (a->rcv_privilege < b->rcv_privilege)
734 		return (-1);
735 
736 	if (a->rcv_privilege > b->rcv_privilege)
737 		return (1);
738 
739 	if (imprecise)
740 		return (0);
741 
742 	if (a->rcv_action_recip_pid < b->rcv_action_recip_pid)
743 		return (-1);
744 
745 	if (a->rcv_action_recip_pid > b->rcv_action_recip_pid)
746 		return (1);
747 
748 	return (0);
749 }
750 
751 static rctl_val_t *
752 rctl_val_list_find(rctl_val_t **head, rctl_val_t *cval)
753 {
754 	rctl_val_t *rval = *head;
755 
756 	while (rval != NULL) {
757 		if (rctl_val_cmp(cval, rval, 0) == 0)
758 			return (rval);
759 
760 		rval = rval->rcv_next;
761 	}
762 
763 	return (NULL);
764 
765 }
766 
767 /*
768  * int rctl_val_list_insert(rctl_val_t **, rctl_val_t *)
769  *
770  * Overview
771  *   This function inserts the rctl_val_t into the value list provided.
772  *   The insert is always successful unless if the value is a duplicate
773  *   of one already in the list.
774  *
775  * Return values
776  *    1 if the value was a duplicate of an existing value in the list.
777  *    0 if the insert was successful.
778  */
779 int
780 rctl_val_list_insert(rctl_val_t **root, rctl_val_t *rval)
781 {
782 	rctl_val_t *prev;
783 	int equiv;
784 
785 	rval->rcv_next = NULL;
786 	rval->rcv_prev = NULL;
787 
788 	if (*root == NULL) {
789 		*root = rval;
790 		return (0);
791 	}
792 
793 	equiv = rctl_val_cmp(rval, *root, 0);
794 
795 	if (equiv == 0)
796 		return (1);
797 
798 	if (equiv < 0) {
799 		rval->rcv_next = *root;
800 		rval->rcv_next->rcv_prev = rval;
801 		*root = rval;
802 
803 		return (0);
804 	}
805 
806 	prev = *root;
807 	while (prev->rcv_next != NULL &&
808 	    (equiv = rctl_val_cmp(rval, prev->rcv_next, 0)) > 0) {
809 		prev = prev->rcv_next;
810 	}
811 
812 	if (equiv == 0)
813 		return (1);
814 
815 	rval->rcv_next = prev->rcv_next;
816 	if (rval->rcv_next != NULL)
817 		rval->rcv_next->rcv_prev = rval;
818 	prev->rcv_next = rval;
819 	rval->rcv_prev = prev;
820 
821 	return (0);
822 }
823 
824 static int
825 rctl_val_list_delete(rctl_val_t **root, rctl_val_t *rval)
826 {
827 	rctl_val_t *prev;
828 
829 	if (*root == NULL)
830 		return (-1);
831 
832 	prev = *root;
833 	if (rctl_val_cmp(rval, prev, 0) == 0) {
834 		*root = prev->rcv_next;
835 		(*root)->rcv_prev = NULL;
836 
837 		kmem_cache_free(rctl_val_cache, prev);
838 
839 		return (0);
840 	}
841 
842 	while (prev->rcv_next != NULL &&
843 	    rctl_val_cmp(rval, prev->rcv_next, 0) != 0) {
844 		prev = prev->rcv_next;
845 	}
846 
847 	if (prev->rcv_next == NULL) {
848 		/*
849 		 * If we navigate the entire list and cannot find a match, then
850 		 * return failure.
851 		 */
852 		return (-1);
853 	}
854 
855 	prev = prev->rcv_next;
856 	prev->rcv_prev->rcv_next = prev->rcv_next;
857 	if (prev->rcv_next != NULL)
858 		prev->rcv_next->rcv_prev = prev->rcv_prev;
859 
860 	kmem_cache_free(rctl_val_cache, prev);
861 
862 	return (0);
863 }
864 
865 static rctl_val_t *
866 rctl_val_list_dup(rctl_val_t *rval, rctl_alloc_gp_t *ragp, struct proc *oldp,
867     struct proc *newp)
868 {
869 	rctl_val_t *head = NULL;
870 
871 	for (; rval != NULL; rval = rval->rcv_next) {
872 		rctl_val_t *dval = rctl_gp_detach_val(ragp);
873 
874 		bcopy(rval, dval, sizeof (rctl_val_t));
875 		dval->rcv_prev = dval->rcv_next = NULL;
876 
877 		if (oldp == NULL ||
878 		    rval->rcv_action_recipient == NULL ||
879 		    rval->rcv_action_recipient == oldp) {
880 			if (rval->rcv_privilege == RCPRIV_BASIC) {
881 				dval->rcv_action_recipient = newp;
882 				dval->rcv_action_recip_pid = newp->p_pid;
883 			} else {
884 				dval->rcv_action_recipient = NULL;
885 				dval->rcv_action_recip_pid = -1;
886 			}
887 
888 			(void) rctl_val_list_insert(&head, dval);
889 		} else {
890 			kmem_cache_free(rctl_val_cache, dval);
891 		}
892 	}
893 
894 	return (head);
895 }
896 
897 static void
898 rctl_val_list_reset(rctl_val_t *rval)
899 {
900 	for (; rval != NULL; rval = rval->rcv_next)
901 		rval->rcv_firing_time = 0;
902 }
903 
904 static uint_t
905 rctl_val_list_count(rctl_val_t *rval)
906 {
907 	uint_t n = 0;
908 
909 	for (; rval != NULL; rval = rval->rcv_next)
910 		n++;
911 
912 	return (n);
913 }
914 
915 
916 static void
917 rctl_val_list_free(rctl_val_t *rval)
918 {
919 	while (rval != NULL) {
920 		rctl_val_t *next = rval->rcv_next;
921 
922 		kmem_cache_free(rctl_val_cache, rval);
923 
924 		rval = next;
925 	}
926 }
927 
928 /*
929  * rctl_qty_t rctl_model_maximum(rctl_dict_entry_t *, struct proc *)
930  *
931  * Overview
932  *   In cases where the operating system supports more than one process
933  *   addressing model, the operating system capabilities will exceed those of
934  *   one or more of these models.  Processes in a less capable model must have
935  *   their resources accurately controlled, without diluting those of their
936  *   descendants reached via exec().  rctl_model_maximum() returns the governing
937  *   value for the specified process with respect to a resource control, such
938  *   that the value can used for the RCTLOP_SET callback or compatability
939  *   support.
940  *
941  * Return values
942  *   The maximum value for the given process for the specified resource control.
943  *
944  * Caller's context
945  *   No restrictions on context.
946  */
947 rctl_qty_t
948 rctl_model_maximum(rctl_dict_entry_t *rde, struct proc *p)
949 {
950 	if (p->p_model == DATAMODEL_NATIVE)
951 		return (rde->rcd_max_native);
952 
953 	return (rde->rcd_max_ilp32);
954 }
955 
956 /*
957  * rctl_qty_t rctl_model_value(rctl_dict_entry_t *, struct proc *, rctl_qty_t)
958  *
959  * Overview
960  *   Convenience function wrapping the rctl_model_maximum() functionality.
961  *
962  * Return values
963  *   The lesser of the process's maximum value and the given value for the
964  *   specified resource control.
965  *
966  * Caller's context
967  *   No restrictions on context.
968  */
969 rctl_qty_t
970 rctl_model_value(rctl_dict_entry_t *rde, struct proc *p, rctl_qty_t value)
971 {
972 	rctl_qty_t max = rctl_model_maximum(rde, p);
973 
974 	return (value < max ? value : max);
975 }
976 
977 static void
978 rctl_set_insert(rctl_set_t *set, rctl_hndl_t hndl, rctl_t *rctl)
979 {
980 	uint_t index = hndl % rctl_set_size;
981 	rctl_t *next_ctl, *prev_ctl;
982 
983 	ASSERT(MUTEX_HELD(&set->rcs_lock));
984 
985 	rctl->rc_next = NULL;
986 
987 	if (set->rcs_ctls[index] == NULL) {
988 		set->rcs_ctls[index] = rctl;
989 		return;
990 	}
991 
992 	if (hndl < set->rcs_ctls[index]->rc_id) {
993 		rctl->rc_next = set->rcs_ctls[index];
994 		set->rcs_ctls[index] = rctl;
995 
996 		return;
997 	}
998 
999 	for (next_ctl = set->rcs_ctls[index]->rc_next,
1000 	    prev_ctl = set->rcs_ctls[index];
1001 	    next_ctl != NULL;
1002 	    prev_ctl = next_ctl,
1003 	    next_ctl = next_ctl->rc_next) {
1004 		if (next_ctl->rc_id > hndl) {
1005 			rctl->rc_next = next_ctl;
1006 			prev_ctl->rc_next = rctl;
1007 
1008 			return;
1009 		}
1010 	}
1011 
1012 	rctl->rc_next = next_ctl;
1013 	prev_ctl->rc_next = rctl;
1014 }
1015 
1016 /*
1017  * rctl_set_t *rctl_set_create()
1018  *
1019  * Overview
1020  *   Create an empty resource control set, suitable for attaching to a
1021  *   controlled entity.
1022  *
1023  * Return values
1024  *   A pointer to the newly created set.
1025  *
1026  * Caller's context
1027  *   Safe for KM_SLEEP allocations.
1028  */
1029 rctl_set_t *
1030 rctl_set_create()
1031 {
1032 	rctl_set_t *rset = kmem_zalloc(sizeof (rctl_set_t), KM_SLEEP);
1033 
1034 	mutex_init(&rset->rcs_lock, NULL, MUTEX_DEFAULT, NULL);
1035 	rset->rcs_ctls = kmem_zalloc(rctl_set_size * sizeof (rctl_t *),
1036 	    KM_SLEEP);
1037 	rset->rcs_entity = -1;
1038 
1039 	return (rset);
1040 }
1041 
1042 /*
1043  * rctl_gp_alloc_t *rctl_set_init_prealloc(rctl_entity_t)
1044  *
1045  * Overview
1046  *    rctl_set_init_prealloc() examines the globally defined resource controls
1047  *    and their default values and returns a resource control allocation group
1048  *    populated with sufficient controls and values to form a representative
1049  *    resource control set for the specified entity.
1050  *
1051  * Return values
1052  *    A pointer to the newly created allocation group.
1053  *
1054  * Caller's context
1055  *    Caller must be in a context suitable for KM_SLEEP allocations.
1056  */
1057 rctl_alloc_gp_t *
1058 rctl_set_init_prealloc(rctl_entity_t entity)
1059 {
1060 	rctl_dict_entry_t *rde;
1061 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1062 
1063 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1064 
1065 	if (rctl_lists[entity] == NULL)
1066 		return (ragp);
1067 
1068 	mutex_enter(&rctl_lists_lock);
1069 
1070 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1071 		ragp->rcag_nctls++;
1072 		ragp->rcag_nvals += rctl_val_list_count(rde->rcd_default_value);
1073 	}
1074 
1075 	mutex_exit(&rctl_lists_lock);
1076 
1077 	rctl_gp_alloc(ragp);
1078 
1079 	return (ragp);
1080 }
1081 
1082 /*
1083  * rctl_set_t *rctl_set_init(rctl_entity_t)
1084  *
1085  * Overview
1086  *   rctl_set_create() creates a resource control set, initialized with the
1087  *   system infinite values on all registered controls, for attachment to a
1088  *   system entity requiring resource controls, such as a process or a task.
1089  *
1090  * Return values
1091  *   A pointer to the newly filled set.
1092  *
1093  * Caller's context
1094  *   Caller must be holding p_lock on entry so that RCTLOP_SET() functions
1095  *   may modify task and project members based on the proc structure
1096  *   they are passed.
1097  */
1098 rctl_set_t *
1099 rctl_set_init(rctl_entity_t entity, struct proc *p, rctl_entity_p_t *e,
1100     rctl_set_t *rset, rctl_alloc_gp_t *ragp)
1101 {
1102 	rctl_dict_entry_t *rde;
1103 
1104 	ASSERT(MUTEX_HELD(&p->p_lock));
1105 	ASSERT(e);
1106 	rset->rcs_entity = entity;
1107 
1108 	if (rctl_lists[entity] == NULL)
1109 		return (rset);
1110 
1111 	mutex_enter(&rctl_lists_lock);
1112 	mutex_enter(&rset->rcs_lock);
1113 
1114 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1115 		rctl_t *rctl = rctl_gp_detach_ctl(ragp);
1116 
1117 		rctl->rc_dict_entry = rde;
1118 		rctl->rc_id = rde->rcd_id;
1119 		rctl->rc_projdb = NULL;
1120 
1121 		rctl->rc_values = rctl_val_list_dup(rde->rcd_default_value,
1122 		    ragp, NULL, p);
1123 		rctl->rc_cursor = rctl->rc_values;
1124 
1125 		ASSERT(rctl->rc_cursor != NULL);
1126 
1127 		rctl_set_insert(rset, rde->rcd_id, rctl);
1128 
1129 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1130 		    rctl->rc_cursor->rcv_value));
1131 	}
1132 
1133 	mutex_exit(&rset->rcs_lock);
1134 	mutex_exit(&rctl_lists_lock);
1135 
1136 	return (rset);
1137 }
1138 
1139 static rctl_t *
1140 rctl_dup(rctl_t *rctl, rctl_alloc_gp_t *ragp, struct proc *oldp,
1141     struct proc *newp)
1142 {
1143 	rctl_t *dup = rctl_gp_detach_ctl(ragp);
1144 	rctl_val_t *dval;
1145 
1146 	dup->rc_id = rctl->rc_id;
1147 	dup->rc_dict_entry = rctl->rc_dict_entry;
1148 	dup->rc_next = NULL;
1149 	dup->rc_cursor = NULL;
1150 	dup->rc_values = rctl_val_list_dup(rctl->rc_values, ragp, oldp, newp);
1151 
1152 	for (dval = dup->rc_values;
1153 	    dval != NULL; dval = dval->rcv_next) {
1154 		if (rctl_val_cmp(rctl->rc_cursor, dval, 0) >= 0) {
1155 			dup->rc_cursor = dval;
1156 			break;
1157 		}
1158 	}
1159 
1160 	if (dup->rc_cursor == NULL)
1161 		dup->rc_cursor = dup->rc_values;
1162 
1163 	return (dup);
1164 }
1165 
1166 static void
1167 rctl_set_fill_alloc_gp(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1168 {
1169 	uint_t i;
1170 
1171 	bzero(ragp, sizeof (rctl_alloc_gp_t));
1172 
1173 	for (i = 0; i < rctl_set_size; i++) {
1174 		rctl_t *r = set->rcs_ctls[i];
1175 
1176 		while (r != NULL) {
1177 			ragp->rcag_nctls++;
1178 
1179 			ragp->rcag_nvals += rctl_val_list_count(r->rc_values);
1180 
1181 			r = r->rc_next;
1182 		}
1183 	}
1184 }
1185 
1186 /*
1187  * rctl_alloc_gp_t *rctl_set_dup_prealloc(rctl_set_t *)
1188  *
1189  * Overview
1190  *   Given a resource control set, allocate a sufficiently large allocation
1191  *   group to contain a duplicate of the set.
1192  *
1193  * Return value
1194  *   A pointer to the newly created allocation group.
1195  *
1196  * Caller's context
1197  *   Safe for KM_SLEEP allocations.
1198  */
1199 rctl_alloc_gp_t *
1200 rctl_set_dup_prealloc(rctl_set_t *set)
1201 {
1202 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1203 
1204 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1205 
1206 	mutex_enter(&set->rcs_lock);
1207 	rctl_set_fill_alloc_gp(set, ragp);
1208 	mutex_exit(&set->rcs_lock);
1209 
1210 	rctl_gp_alloc(ragp);
1211 
1212 	return (ragp);
1213 }
1214 
1215 /*
1216  * int rctl_set_dup_ready(rctl_set_t *, rctl_alloc_gp_t *)
1217  *
1218  * Overview
1219  *   Verify that the allocation group provided is large enough to allow a
1220  *   duplicate of the given resource control set to be constructed from its
1221  *   contents.
1222  *
1223  * Return values
1224  *   1 if the allocation group is sufficiently large, 0 otherwise.
1225  *
1226  * Caller's context
1227  *   rcs_lock must be held prior to entry.
1228  */
1229 int
1230 rctl_set_dup_ready(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1231 {
1232 	rctl_alloc_gp_t curr_gp;
1233 
1234 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1235 
1236 	rctl_set_fill_alloc_gp(set, &curr_gp);
1237 
1238 	if (curr_gp.rcag_nctls <= ragp->rcag_nctls &&
1239 	    curr_gp.rcag_nvals <= ragp->rcag_nvals)
1240 		return (1);
1241 
1242 	return (0);
1243 }
1244 
1245 /*
1246  * rctl_set_t *rctl_set_dup(rctl_set_t *, struct proc *, struct proc *,
1247  *   rctl_set_t *, rctl_alloc_gp_t *, int)
1248  *
1249  * Overview
1250  *   Make a duplicate of the resource control set.  The proc pointers are those
1251  *   of the owning process and of the process associated with the entity
1252  *   receiving the duplicate.
1253  *
1254  *   Duplication is a 3 stage process. Stage 1 is memory allocation for
1255  *   the duplicate set, which is taken care of by rctl_set_dup_prealloc().
1256  *   Stage 2 consists of copying all rctls and values from the old set into
1257  *   the new. Stage 3 completes the duplication by performing the appropriate
1258  *   callbacks for each rctl in the new set.
1259  *
1260  *   Stages 2 and 3 are handled by calling rctl_set_dup with the RCD_DUP and
1261  *   RCD_CALLBACK functions, respectively. The RCD_CALLBACK flag may only
1262  *   be supplied if the newp proc structure reflects the new task and
1263  *   project linkage.
1264  *
1265  * Return value
1266  *   A pointer to the duplicate set.
1267  *
1268  * Caller's context
1269  *   The rcs_lock of the set to be duplicated must be held prior to entry.
1270  */
1271 rctl_set_t *
1272 rctl_set_dup(rctl_set_t *set, struct proc *oldp, struct proc *newp,
1273     rctl_entity_p_t *e, rctl_set_t *dup, rctl_alloc_gp_t *ragp, int flag)
1274 {
1275 	uint_t i;
1276 	rctl_set_t	*iter;
1277 
1278 	ASSERT((flag & RCD_DUP) || (flag & RCD_CALLBACK));
1279 	ASSERT(e);
1280 	/*
1281 	 * When copying the old set, iterate over that. Otherwise, when
1282 	 * only callbacks have been requested, iterate over the dup set.
1283 	 */
1284 	if (flag & RCD_DUP) {
1285 		ASSERT(MUTEX_HELD(&set->rcs_lock));
1286 		iter = set;
1287 		dup->rcs_entity = set->rcs_entity;
1288 	} else {
1289 		iter = dup;
1290 	}
1291 
1292 	mutex_enter(&dup->rcs_lock);
1293 
1294 	for (i = 0; i < rctl_set_size; i++) {
1295 		rctl_t *r = iter->rcs_ctls[i];
1296 		rctl_t *d;
1297 
1298 		while (r != NULL) {
1299 			if (flag & RCD_DUP) {
1300 				d = rctl_dup(r, ragp, oldp, newp);
1301 				rctl_set_insert(dup, r->rc_id, d);
1302 			} else {
1303 				d = r;
1304 			}
1305 
1306 			if (flag & RCD_CALLBACK)
1307 				RCTLOP_SET(d, newp, e,
1308 				    rctl_model_value(d->rc_dict_entry, newp,
1309 				    d->rc_cursor->rcv_value));
1310 
1311 			r = r->rc_next;
1312 		}
1313 	}
1314 
1315 	mutex_exit(&dup->rcs_lock);
1316 
1317 	return (dup);
1318 }
1319 
1320 /*
1321  * void rctl_set_free(rctl_set_t *)
1322  *
1323  * Overview
1324  *   Delete resource control set and all attached values.
1325  *
1326  * Return values
1327  *   No value returned.
1328  *
1329  * Caller's context
1330  *   No restrictions on context.
1331  */
1332 void
1333 rctl_set_free(rctl_set_t *set)
1334 {
1335 	uint_t i;
1336 
1337 	mutex_enter(&set->rcs_lock);
1338 	for (i = 0; i < rctl_set_size; i++) {
1339 		rctl_t *r = set->rcs_ctls[i];
1340 
1341 		while (r != NULL) {
1342 			rctl_val_t *v = r->rc_values;
1343 			rctl_t *n = r->rc_next;
1344 
1345 			kmem_cache_free(rctl_cache, r);
1346 
1347 			rctl_val_list_free(v);
1348 
1349 			r = n;
1350 		}
1351 	}
1352 	mutex_exit(&set->rcs_lock);
1353 
1354 	kmem_free(set->rcs_ctls, sizeof (rctl_t *) * rctl_set_size);
1355 	kmem_free(set, sizeof (rctl_set_t));
1356 }
1357 
1358 /*
1359  * void rctl_set_reset(rctl_set_t *)
1360  *
1361  * Overview
1362  *   Resets all rctls within the set such that the lowest value becomes active.
1363  *
1364  * Return values
1365  *   No value returned.
1366  *
1367  * Caller's context
1368  *   No restrictions on context.
1369  */
1370 void
1371 rctl_set_reset(rctl_set_t *set, struct proc *p, rctl_entity_p_t *e)
1372 {
1373 	uint_t i;
1374 
1375 	ASSERT(e);
1376 
1377 	mutex_enter(&set->rcs_lock);
1378 	for (i = 0; i < rctl_set_size; i++) {
1379 		rctl_t *r = set->rcs_ctls[i];
1380 
1381 		while (r != NULL) {
1382 			r->rc_cursor = r->rc_values;
1383 			rctl_val_list_reset(r->rc_cursor);
1384 			RCTLOP_SET(r, p, e, rctl_model_value(r->rc_dict_entry,
1385 			    p, r->rc_cursor->rcv_value));
1386 
1387 			ASSERT(r->rc_cursor != NULL);
1388 
1389 			r = r->rc_next;
1390 		}
1391 	}
1392 
1393 	mutex_exit(&set->rcs_lock);
1394 }
1395 
1396 /*
1397  * void rctl_set_tearoff(rctl_set *, struct proc *)
1398  *
1399  * Overview
1400  *   Tear off any resource control values on this set with an action recipient
1401  *   equal to the specified process (as they are becoming invalid with the
1402  *   process's departure from this set as an observer).
1403  *
1404  * Return values
1405  *   No value returned.
1406  *
1407  * Caller's context
1408  *   No restrictions on context
1409  */
1410 void
1411 rctl_set_tearoff(rctl_set_t *set, struct proc *p)
1412 {
1413 	uint_t i;
1414 
1415 	mutex_enter(&set->rcs_lock);
1416 	for (i = 0; i < rctl_set_size; i++) {
1417 		rctl_t *r = set->rcs_ctls[i];
1418 
1419 		while (r != NULL) {
1420 			rctl_val_t *rval;
1421 
1422 tearoff_rewalk_list:
1423 			rval = r->rc_values;
1424 
1425 			while (rval != NULL) {
1426 				if (rval->rcv_privilege == RCPRIV_BASIC &&
1427 				    rval->rcv_action_recipient == p) {
1428 					if (r->rc_cursor == rval)
1429 						r->rc_cursor = rval->rcv_next;
1430 
1431 					(void) rctl_val_list_delete(
1432 					    &r->rc_values, rval);
1433 
1434 					goto tearoff_rewalk_list;
1435 				}
1436 
1437 				rval = rval->rcv_next;
1438 			}
1439 
1440 			ASSERT(r->rc_cursor != NULL);
1441 
1442 			r = r->rc_next;
1443 		}
1444 	}
1445 
1446 	mutex_exit(&set->rcs_lock);
1447 }
1448 
1449 static int
1450 rctl_set_find(rctl_set_t *set, rctl_hndl_t hndl, rctl_t **rctl)
1451 {
1452 	uint_t index = hndl % rctl_set_size;
1453 	rctl_t *curr_ctl;
1454 
1455 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1456 
1457 	for (curr_ctl = set->rcs_ctls[index]; curr_ctl != NULL;
1458 	    curr_ctl = curr_ctl->rc_next) {
1459 		if (curr_ctl->rc_id == hndl) {
1460 			*rctl = curr_ctl;
1461 
1462 			return (0);
1463 		}
1464 	}
1465 
1466 	return (-1);
1467 }
1468 
1469 /*
1470  * rlim64_t rctl_enforced_value(rctl_hndl_t, rctl_set_t *, struct proc *)
1471  *
1472  * Overview
1473  *   Given a process, get the next enforced value on the rctl of the specified
1474  *   handle.
1475  *
1476  * Return value
1477  *   The enforced value.
1478  *
1479  * Caller's context
1480  *   For controls on process collectives, p->p_lock must be held across the
1481  *   operation.
1482  */
1483 /*ARGSUSED*/
1484 rctl_qty_t
1485 rctl_enforced_value(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p)
1486 {
1487 	rctl_t *rctl;
1488 	rlim64_t ret;
1489 
1490 	mutex_enter(&rset->rcs_lock);
1491 
1492 	if (rctl_set_find(rset, hndl, &rctl) == -1)
1493 		panic("unknown resource control handle %d requested", hndl);
1494 	else
1495 		ret = rctl_model_value(rctl->rc_dict_entry, p,
1496 		    rctl->rc_cursor->rcv_value);
1497 
1498 	mutex_exit(&rset->rcs_lock);
1499 
1500 	return (ret);
1501 }
1502 
1503 /*
1504  * int rctl_global_get(const char *, rctl_dict_entry_t *)
1505  *
1506  * Overview
1507  *   Copy a sanitized version of the global rctl for a given resource control
1508  *   name.  (By sanitization, we mean that the unsafe data pointers have been
1509  *   zeroed.)
1510  *
1511  * Return value
1512  *   -1 if name not defined, 0 otherwise.
1513  *
1514  * Caller's context
1515  *   No restrictions on context.  rctl_dict_lock must not be held.
1516  */
1517 int
1518 rctl_global_get(const char *name, rctl_dict_entry_t *drde)
1519 {
1520 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1521 
1522 	if (rde == NULL)
1523 		return (-1);
1524 
1525 	bcopy(rde, drde, sizeof (rctl_dict_entry_t));
1526 
1527 	drde->rcd_next = NULL;
1528 	drde->rcd_ops = NULL;
1529 
1530 	return (0);
1531 }
1532 
1533 /*
1534  * int rctl_global_set(const char *, rctl_dict_entry_t *)
1535  *
1536  * Overview
1537  *   Transfer the settable fields of the named rctl to the global rctl matching
1538  *   the given resource control name.
1539  *
1540  * Return value
1541  *   -1 if name not defined, 0 otherwise.
1542  *
1543  * Caller's context
1544  *   No restrictions on context.  rctl_dict_lock must not be held.
1545  */
1546 int
1547 rctl_global_set(const char *name, rctl_dict_entry_t *drde)
1548 {
1549 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1550 
1551 	if (rde == NULL)
1552 		return (-1);
1553 
1554 	rde->rcd_flagaction = drde->rcd_flagaction;
1555 	rde->rcd_syslog_level = drde->rcd_syslog_level;
1556 	rde->rcd_strlog_flags = drde->rcd_strlog_flags;
1557 
1558 	return (0);
1559 }
1560 
1561 static int
1562 rctl_local_op(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1563     int (*cbop)(rctl_hndl_t, struct proc *p, rctl_entity_p_t *e, rctl_t *,
1564     rctl_val_t *, rctl_val_t *), struct proc *p)
1565 {
1566 	rctl_t *rctl;
1567 	rctl_set_t *rset;
1568 	rctl_entity_p_t e;
1569 	int ret = 0;
1570 	rctl_dict_entry_t *rde = rctl_dict_lookup_hndl(hndl);
1571 
1572 local_op_retry:
1573 
1574 	ASSERT(MUTEX_HELD(&p->p_lock));
1575 
1576 	rset = rctl_entity_obtain_rset(rde, p);
1577 
1578 	if (rset == NULL) {
1579 		return (-1);
1580 	}
1581 	rctl_entity_obtain_entity_p(rset->rcs_entity, p, &e);
1582 
1583 	mutex_enter(&rset->rcs_lock);
1584 
1585 	/* using rctl's hndl, get rctl from local set */
1586 	if (rctl_set_find(rset, hndl, &rctl) == -1) {
1587 		mutex_exit(&rset->rcs_lock);
1588 		return (-1);
1589 	}
1590 
1591 	ret = cbop(hndl, p, &e, rctl, oval, nval);
1592 
1593 	mutex_exit(&rset->rcs_lock);
1594 	return (ret);
1595 }
1596 
1597 /*ARGSUSED*/
1598 static int
1599 rctl_local_get_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1600     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1601 {
1602 	if (oval == NULL) {
1603 		/*
1604 		 * RCTL_FIRST
1605 		 */
1606 		bcopy(rctl->rc_values, nval, sizeof (rctl_val_t));
1607 	} else {
1608 		/*
1609 		 * RCTL_NEXT
1610 		 */
1611 		rctl_val_t *tval = rctl_val_list_find(&rctl->rc_values, oval);
1612 
1613 		if (tval == NULL)
1614 			return (ESRCH);
1615 		else if (tval->rcv_next == NULL)
1616 			return (ENOENT);
1617 		else
1618 			bcopy(tval->rcv_next, nval, sizeof (rctl_val_t));
1619 	}
1620 
1621 	return (0);
1622 }
1623 
1624 /*
1625  * int rctl_local_get(rctl_hndl_t, rctl_val_t *)
1626  *
1627  * Overview
1628  *   Get the rctl value for the given flags.
1629  *
1630  * Return values
1631  *   0 for successful get, errno otherwise.
1632  */
1633 int
1634 rctl_local_get(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1635     struct proc *p)
1636 {
1637 	return (rctl_local_op(hndl, oval, nval, rctl_local_get_cb, p));
1638 }
1639 
1640 /*ARGSUSED*/
1641 static int
1642 rctl_local_delete_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1643     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1644 {
1645 	if ((oval = rctl_val_list_find(&rctl->rc_values, nval)) == NULL)
1646 		return (ESRCH);
1647 
1648 	if (rctl->rc_cursor == oval) {
1649 		rctl->rc_cursor = oval->rcv_next;
1650 		rctl_val_list_reset(rctl->rc_cursor);
1651 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1652 		    rctl->rc_cursor->rcv_value));
1653 
1654 		ASSERT(rctl->rc_cursor != NULL);
1655 	}
1656 
1657 	(void) rctl_val_list_delete(&rctl->rc_values, oval);
1658 
1659 	return (0);
1660 }
1661 
1662 /*
1663  * int rctl_local_delete(rctl_hndl_t, rctl_val_t *)
1664  *
1665  * Overview
1666  *   Delete the rctl value for the given flags.
1667  *
1668  * Return values
1669  *   0 for successful delete, errno otherwise.
1670  */
1671 int
1672 rctl_local_delete(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1673 {
1674 	return (rctl_local_op(hndl, NULL, val, rctl_local_delete_cb, p));
1675 }
1676 
1677 /*
1678  * rctl_local_insert_cb()
1679  *
1680  * Overview
1681  *   Insert a new value into the rctl's val list. If an error occurs,
1682  *   the val list must be left in the same state as when the function
1683  *   was entered.
1684  *
1685  * Return Values
1686  *   0 for successful insert, EINVAL if the value is duplicated in the
1687  *   existing list.
1688  */
1689 /*ARGSUSED*/
1690 static int
1691 rctl_local_insert_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1692     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1693 {
1694 	/*
1695 	 * Before inserting, confirm there are no duplicates of this value
1696 	 * and flag level. If there is a duplicate, flag an error and do
1697 	 * nothing.
1698 	 */
1699 	if (rctl_val_list_insert(&rctl->rc_values, nval) != 0)
1700 		return (EINVAL);
1701 
1702 	if (rctl_val_cmp(nval, rctl->rc_cursor, 0) < 0) {
1703 		rctl->rc_cursor = nval;
1704 		rctl_val_list_reset(rctl->rc_cursor);
1705 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1706 		    rctl->rc_cursor->rcv_value));
1707 
1708 		ASSERT(rctl->rc_cursor != NULL);
1709 	}
1710 
1711 	return (0);
1712 }
1713 
1714 /*
1715  * int rctl_local_insert(rctl_hndl_t, rctl_val_t *)
1716  *
1717  * Overview
1718  *   Insert the rctl value into the appropriate rctl set for the calling
1719  *   process, given the handle.
1720  */
1721 int
1722 rctl_local_insert(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1723 {
1724 	return (rctl_local_op(hndl, NULL, val, rctl_local_insert_cb, p));
1725 }
1726 
1727 /*
1728  * rctl_local_insert_all_cb()
1729  *
1730  * Overview
1731  *   Called for RCENTITY_PROJECT rctls only, via rctlsys_projset().
1732  *
1733  *   Inserts new values from the project database (new_values).  alloc_values
1734  *   should be a linked list of pre-allocated rctl_val_t, which are used to
1735  *   populate (rc_projdb).
1736  *
1737  *   Should the *new_values linked list match the contents of the rctl's
1738  *   rp_projdb then we do nothing.
1739  *
1740  * Return Values
1741  *   0 is always returned.
1742  */
1743 /*ARGSUSED*/
1744 static int
1745 rctl_local_insert_all_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1746     rctl_t *rctl, rctl_val_t *new_values, rctl_val_t *alloc_values)
1747 {
1748 	rctl_val_t *val;
1749 	rctl_val_t *tmp_val;
1750 	rctl_val_t *next;
1751 	int modified = 0;
1752 
1753 	/*
1754 	 * If this the first time we've set this project rctl, then we delete
1755 	 * all the privilege values.  These privilege values have been set by
1756 	 * rctl_add_default_limit().
1757 	 *
1758 	 * We save some cycles here by not calling rctl_val_list_delete().
1759 	 */
1760 	if (rctl->rc_projdb == NULL) {
1761 		val = rctl->rc_values;
1762 
1763 		while (val != NULL) {
1764 			if (val->rcv_privilege == RCPRIV_PRIVILEGED) {
1765 				if (val->rcv_prev != NULL)
1766 					val->rcv_prev->rcv_next = val->rcv_next;
1767 				else
1768 					rctl->rc_values = val->rcv_next;
1769 
1770 				if (val->rcv_next != NULL)
1771 					val->rcv_next->rcv_prev = val->rcv_prev;
1772 
1773 				tmp_val = val;
1774 				val = val->rcv_next;
1775 				kmem_cache_free(rctl_val_cache, tmp_val);
1776 			} else {
1777 				val = val->rcv_next;
1778 			}
1779 		}
1780 		modified = 1;
1781 	}
1782 
1783 	/*
1784 	 * Delete active values previously set through the project database.
1785 	 */
1786 	val = rctl->rc_projdb;
1787 
1788 	while (val != NULL) {
1789 
1790 		/* Is the old value found in the new values? */
1791 		if (rctl_val_list_find(&new_values, val) == NULL) {
1792 
1793 			/*
1794 			 * Delete from the active values if it originated from
1795 			 * the project database.
1796 			 */
1797 			if (((tmp_val = rctl_val_list_find(&rctl->rc_values,
1798 			    val)) != NULL) &&
1799 			    (tmp_val->rcv_flagaction & RCTL_LOCAL_PROJDB)) {
1800 				(void) rctl_val_list_delete(&rctl->rc_values,
1801 				    tmp_val);
1802 			}
1803 
1804 			tmp_val = val->rcv_next;
1805 			(void) rctl_val_list_delete(&rctl->rc_projdb, val);
1806 			val = tmp_val;
1807 			modified = 1;
1808 
1809 		} else
1810 			val = val->rcv_next;
1811 	}
1812 
1813 	/*
1814 	 * Insert new values from the project database.
1815 	 */
1816 	while (new_values != NULL) {
1817 		next = new_values->rcv_next;
1818 
1819 		/*
1820 		 * Insert this new value into the rc_projdb, and duplicate this
1821 		 * entry to the active list.
1822 		 */
1823 		if (rctl_val_list_insert(&rctl->rc_projdb, new_values) == 0) {
1824 
1825 			tmp_val = alloc_values->rcv_next;
1826 			bcopy(new_values, alloc_values, sizeof (rctl_val_t));
1827 			alloc_values->rcv_next = tmp_val;
1828 
1829 			if (rctl_val_list_insert(&rctl->rc_values,
1830 				alloc_values) == 0) {
1831 				/* inserted move alloc_values on */
1832 				alloc_values = tmp_val;
1833 				modified = 1;
1834 			}
1835 		} else {
1836 			/*
1837 			 * Unlike setrctl() we don't want to return an error on
1838 			 * a duplicate entry; we are concerned solely with
1839 			 * ensuring that all the values specified are set.
1840 			 */
1841 			kmem_cache_free(rctl_val_cache, new_values);
1842 		}
1843 		new_values = next;
1844 	}
1845 
1846 	/* Teardown any unused rctl_val_t */
1847 	while (alloc_values != NULL) {
1848 		tmp_val = alloc_values;
1849 		alloc_values = alloc_values->rcv_next;
1850 		kmem_cache_free(rctl_val_cache, tmp_val);
1851 	}
1852 
1853 	/* Reset the cursor if rctl values have been modified */
1854 	if (modified) {
1855 		rctl->rc_cursor = rctl->rc_values;
1856 		rctl_val_list_reset(rctl->rc_cursor);
1857 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1858 		    rctl->rc_cursor->rcv_value));
1859 	}
1860 
1861 	return (0);
1862 }
1863 
1864 int
1865 rctl_local_insert_all(rctl_hndl_t hndl, rctl_val_t *new_values,
1866     rctl_val_t *alloc_values, struct proc *p)
1867 {
1868 	return (rctl_local_op(hndl, new_values, alloc_values,
1869 	    rctl_local_insert_all_cb, p));
1870 }
1871 
1872 /*
1873  * rctl_local_replace_all_cb()
1874  *
1875  * Overview
1876  *   Called for RCENTITY_PROJECT rctls only, via rctlsys_projset().
1877  *
1878  *   Clears the active rctl values (rc_values), and stored values from the
1879  *   previous insertions from the project database (rc_projdb).
1880  *
1881  *   Inserts new values from the project database (new_values).  alloc_values
1882  *   should be a linked list of pre-allocated rctl_val_t, which are used to
1883  *   populate (rc_projdb).
1884  *
1885  * Return Values
1886  *   0 is always returned.
1887  */
1888 /*ARGSUSED*/
1889 static int
1890 rctl_local_replace_all_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1891     rctl_t *rctl, rctl_val_t *new_values, rctl_val_t *alloc_values)
1892 {
1893 	rctl_val_t *val;
1894 	rctl_val_t *next;
1895 	rctl_val_t *tmp_val;
1896 
1897 	/* Delete all the privilege vaules */
1898 	val = rctl->rc_values;
1899 
1900 	while (val != NULL) {
1901 		if (val->rcv_privilege == RCPRIV_PRIVILEGED) {
1902 			if (val->rcv_prev != NULL)
1903 				val->rcv_prev->rcv_next = val->rcv_next;
1904 			else
1905 				rctl->rc_values = val->rcv_next;
1906 
1907 			if (val->rcv_next != NULL)
1908 				val->rcv_next->rcv_prev = val->rcv_prev;
1909 
1910 			tmp_val = val;
1911 			val = val->rcv_next;
1912 			kmem_cache_free(rctl_val_cache, tmp_val);
1913 		} else {
1914 			val = val->rcv_next;
1915 		}
1916 	}
1917 
1918 	/* Delete the contents of rc_projdb */
1919 	val = rctl->rc_projdb;
1920 	while (val != NULL) {
1921 
1922 		tmp_val = val;
1923 		val = val->rcv_next;
1924 		kmem_cache_free(rctl_val_cache, tmp_val);
1925 	}
1926 	rctl->rc_projdb = NULL;
1927 
1928 	/*
1929 	 * Insert new values from the project database.
1930 	 */
1931 	while (new_values != NULL) {
1932 		next = new_values->rcv_next;
1933 
1934 		if (rctl_val_list_insert(&rctl->rc_projdb, new_values) == 0) {
1935 			tmp_val = alloc_values->rcv_next;
1936 			bcopy(new_values, alloc_values, sizeof (rctl_val_t));
1937 			alloc_values->rcv_next = tmp_val;
1938 
1939 			if (rctl_val_list_insert(&rctl->rc_values,
1940 				alloc_values) == 0) {
1941 				/* inserted, so move alloc_values on */
1942 				alloc_values = tmp_val;
1943 			}
1944 		} else {
1945 			/*
1946 			 * Unlike setrctl() we don't want to return an error on
1947 			 * a duplicate entry; we are concerned solely with
1948 			 * ensuring that all the values specified are set.
1949 			 */
1950 			kmem_cache_free(rctl_val_cache, new_values);
1951 		}
1952 
1953 		new_values = next;
1954 	}
1955 
1956 	/* Teardown any unused rctl_val_t */
1957 	while (alloc_values != NULL) {
1958 		tmp_val = alloc_values;
1959 		alloc_values = alloc_values->rcv_next;
1960 		kmem_cache_free(rctl_val_cache, tmp_val);
1961 	}
1962 
1963 	/* Always reset the cursor */
1964 	rctl->rc_cursor = rctl->rc_values;
1965 	rctl_val_list_reset(rctl->rc_cursor);
1966 	RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1967 	    rctl->rc_cursor->rcv_value));
1968 
1969 	return (0);
1970 }
1971 
1972 int
1973 rctl_local_replace_all(rctl_hndl_t hndl, rctl_val_t *new_values,
1974     rctl_val_t *alloc_values, struct proc *p)
1975 {
1976 	return (rctl_local_op(hndl, new_values, alloc_values,
1977 	    rctl_local_replace_all_cb, p));
1978 }
1979 
1980 static int
1981 rctl_local_replace_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1982     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1983 {
1984 	int ret;
1985 	rctl_val_t *tmp;
1986 
1987 	/* Verify that old will be delete-able */
1988 	tmp = rctl_val_list_find(&rctl->rc_values, oval);
1989 	if (tmp == NULL)
1990 		return (ESRCH);
1991 	/*
1992 	 * Caller should verify that value being deleted is not the
1993 	 * system value.
1994 	 */
1995 	ASSERT(tmp->rcv_privilege != RCPRIV_SYSTEM);
1996 
1997 	/*
1998 	 * rctl_local_insert_cb() does the job of flagging an error
1999 	 * for any duplicate values. So, call rctl_local_insert_cb()
2000 	 * for the new value first, then do deletion of the old value.
2001 	 * Since this is a callback function to rctl_local_op, we can
2002 	 * count on rcs_lock being held at this point. This guarantees
2003 	 * that there is at no point a visible list which contains both
2004 	 * new and old values.
2005 	 */
2006 	if (ret = rctl_local_insert_cb(hndl, p, e, rctl, NULL, nval))
2007 		return (ret);
2008 
2009 	ret = rctl_local_delete_cb(hndl, p, e, rctl, NULL, oval);
2010 	ASSERT(ret == 0);
2011 	return (0);
2012 }
2013 
2014 /*
2015  * int rctl_local_replace(rctl_hndl_t, void *, int, uint64_t *)
2016  *
2017  * Overview
2018  *   Replace the rctl value with a new one.
2019  *
2020  * Return values
2021  *   0 for successful replace, errno otherwise.
2022  */
2023 int
2024 rctl_local_replace(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
2025     struct proc *p)
2026 {
2027 	return (rctl_local_op(hndl, oval, nval, rctl_local_replace_cb, p));
2028 }
2029 
2030 /*
2031  * int rctl_rlimit_get(rctl_hndl_t, struct proc *, struct rlimit64 *)
2032  *
2033  * Overview
2034  *   To support rlimit compatibility, we need a function which takes a 64-bit
2035  *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
2036  *   This operation is only intended for legacy rlimits.
2037  */
2038 int
2039 rctl_rlimit_get(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64)
2040 {
2041 	rctl_t *rctl;
2042 	rctl_val_t *rval;
2043 	rctl_set_t *rset = p->p_rctls;
2044 	int soft_limit_seen = 0;
2045 	int test_for_deny = 1;
2046 
2047 	mutex_enter(&rset->rcs_lock);
2048 	if (rctl_set_find(rset, rc, &rctl) == -1) {
2049 		mutex_exit(&rset->rcs_lock);
2050 		return (-1);
2051 	}
2052 
2053 	rval = rctl->rc_values;
2054 
2055 	if (rctl->rc_dict_entry->rcd_flagaction & (RCTL_GLOBAL_DENY_NEVER |
2056 	    RCTL_GLOBAL_DENY_ALWAYS))
2057 		test_for_deny = 0;
2058 
2059 	/*
2060 	 * 1.  Find the first control value with the RCTL_LOCAL_DENY bit set.
2061 	 */
2062 	while (rval != NULL && rval->rcv_privilege != RCPRIV_SYSTEM) {
2063 		if (test_for_deny &&
2064 		    (rval->rcv_flagaction & RCTL_LOCAL_DENY) == 0) {
2065 			rval = rval->rcv_next;
2066 			continue;
2067 		}
2068 
2069 		/*
2070 		 * 2.  If this is an RCPRIV_BASIC value, then we've found the
2071 		 * effective soft limit and should set rlim_cur.  We should then
2072 		 * continue looking for another control value with the DENY bit
2073 		 * set.
2074 		 */
2075 		if (rval->rcv_privilege == RCPRIV_BASIC) {
2076 			if (soft_limit_seen) {
2077 				rval = rval->rcv_next;
2078 				continue;
2079 			}
2080 
2081 			if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2082 			    rval->rcv_value < rctl_model_maximum(
2083 			    rctl->rc_dict_entry, p))
2084 				rlp64->rlim_cur = rval->rcv_value;
2085 			else
2086 				rlp64->rlim_cur = RLIM64_INFINITY;
2087 			soft_limit_seen = 1;
2088 
2089 			rval = rval->rcv_next;
2090 			continue;
2091 		}
2092 
2093 		/*
2094 		 * 3.  This is an RCPRIV_PRIVILEGED value.  If we haven't found
2095 		 * a soft limit candidate, then we've found the effective hard
2096 		 * and soft limits and should set both  If we had found a soft
2097 		 * limit, then this is only the hard limit and we need only set
2098 		 * rlim_max.
2099 		 */
2100 		if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2101 		    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry,
2102 		    p))
2103 			rlp64->rlim_max = rval->rcv_value;
2104 		else
2105 			rlp64->rlim_max = RLIM64_INFINITY;
2106 		if (!soft_limit_seen)
2107 			rlp64->rlim_cur = rlp64->rlim_max;
2108 
2109 		mutex_exit(&rset->rcs_lock);
2110 		return (0);
2111 	}
2112 
2113 	if (rval == NULL) {
2114 		/*
2115 		 * This control sequence is corrupt, as it is not terminated by
2116 		 * a system privileged control value.
2117 		 */
2118 		mutex_exit(&rset->rcs_lock);
2119 		return (-1);
2120 	}
2121 
2122 	/*
2123 	 * 4.  If we run into a RCPRIV_SYSTEM value, then the hard limit (and
2124 	 * the soft, if we haven't a soft candidate) should be the value of the
2125 	 * system control value.
2126 	 */
2127 	if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2128 	    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry, p))
2129 		rlp64->rlim_max = rval->rcv_value;
2130 	else
2131 		rlp64->rlim_max = RLIM64_INFINITY;
2132 
2133 	if (!soft_limit_seen)
2134 		rlp64->rlim_cur = rlp64->rlim_max;
2135 
2136 	mutex_exit(&rset->rcs_lock);
2137 	return (0);
2138 }
2139 
2140 /*
2141  * rctl_alloc_gp_t *rctl_rlimit_set_prealloc(uint_t)
2142  *
2143  * Overview
2144  *   Before making a series of calls to rctl_rlimit_set(), we must have a
2145  *   preallocated batch of resource control values, as rctl_rlimit_set() can
2146  *   potentially consume two resource control values per call.
2147  *
2148  * Return values
2149  *   A populated resource control allocation group with 2n resource control
2150  *   values.
2151  *
2152  * Caller's context
2153  *   Must be safe for KM_SLEEP allocations.
2154  */
2155 rctl_alloc_gp_t *
2156 rctl_rlimit_set_prealloc(uint_t n)
2157 {
2158 	rctl_alloc_gp_t *gp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
2159 
2160 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
2161 
2162 	gp->rcag_nvals = 2 * n;
2163 
2164 	rctl_gp_alloc(gp);
2165 
2166 	return (gp);
2167 }
2168 
2169 /*
2170  * int rctl_rlimit_set(rctl_hndl_t, struct proc *, struct rlimit64 *, int,
2171  *   int)
2172  *
2173  * Overview
2174  *   To support rlimit compatibility, we need a function which takes a 64-bit
2175  *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
2176  *   This operation is only intended for legacy rlimits.
2177  *
2178  *   The implementation of rctl_rlimit_set() is a bit clever, as it tries to
2179  *   minimize the number of values placed on the value sequence in various
2180  *   cases.  Furthermore, we don't allow multiple identical privilege-action
2181  *   values on the same sequence.  (That is, we don't want a sequence like
2182  *   "while (1) { rlim.rlim_cur++; setrlimit(..., rlim); }" to exhaust kernel
2183  *   memory.)  So we want to delete any values with the same privilege value and
2184  *   action.
2185  *
2186  * Return values
2187  *   0 for successful set, errno otherwise. Errno will be either EINVAL
2188  *   or EPERM, in keeping with defined errnos for ulimit() and setrlimit()
2189  *   system calls.
2190  */
2191 /*ARGSUSED*/
2192 int
2193 rctl_rlimit_set(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64,
2194     rctl_alloc_gp_t *ragp, int flagaction, int signal, const cred_t *cr)
2195 {
2196 	rctl_t *rctl;
2197 	rctl_val_t *rval, *rval_priv, *rval_basic;
2198 	rctl_set_t *rset = p->p_rctls;
2199 	rctl_qty_t max;
2200 	rctl_entity_p_t e;
2201 	struct rlimit64 cur_rl;
2202 
2203 	e.rcep_t = RCENTITY_PROCESS;
2204 	e.rcep_p.proc = p;
2205 
2206 	if (rlp64->rlim_cur > rlp64->rlim_max)
2207 		return (EINVAL);
2208 
2209 	if (rctl_rlimit_get(rc, p, &cur_rl) == -1)
2210 		return (EINVAL);
2211 
2212 	/*
2213 	 * If we are not privileged, we can only lower the hard limit.
2214 	 */
2215 	if ((rlp64->rlim_max > cur_rl.rlim_max) &&
2216 	    cur_rl.rlim_max != RLIM64_INFINITY &&
2217 	    secpolicy_resource(cr) != 0)
2218 		return (EPERM);
2219 
2220 	mutex_enter(&rset->rcs_lock);
2221 
2222 	if (rctl_set_find(rset, rc, &rctl) == -1) {
2223 		mutex_exit(&rset->rcs_lock);
2224 		return (EINVAL);
2225 	}
2226 
2227 	rval_priv = rctl_gp_detach_val(ragp);
2228 
2229 	rval = rctl->rc_values;
2230 
2231 	while (rval != NULL) {
2232 		rctl_val_t *next = rval->rcv_next;
2233 
2234 		if (rval->rcv_privilege == RCPRIV_SYSTEM)
2235 			break;
2236 
2237 		if ((rval->rcv_privilege == RCPRIV_BASIC) ||
2238 		    (rval->rcv_flagaction & ~RCTL_LOCAL_ACTION_MASK) ==
2239 		    (flagaction & ~RCTL_LOCAL_ACTION_MASK)) {
2240 			if (rctl->rc_cursor == rval) {
2241 				rctl->rc_cursor = rval->rcv_next;
2242 				rctl_val_list_reset(rctl->rc_cursor);
2243 				RCTLOP_SET(rctl, p, &e, rctl_model_value(
2244 				    rctl->rc_dict_entry, p,
2245 				    rctl->rc_cursor->rcv_value));
2246 			}
2247 			(void) rctl_val_list_delete(&rctl->rc_values, rval);
2248 		}
2249 
2250 		rval = next;
2251 	}
2252 
2253 	rval_priv->rcv_privilege = RCPRIV_PRIVILEGED;
2254 	rval_priv->rcv_flagaction = flagaction;
2255 	if (rlp64->rlim_max == RLIM64_INFINITY) {
2256 		rval_priv->rcv_flagaction |= RCTL_LOCAL_MAXIMAL;
2257 		max = rctl->rc_dict_entry->rcd_max_native;
2258 	} else {
2259 		max = rlp64->rlim_max;
2260 	}
2261 	rval_priv->rcv_value = max;
2262 	rval_priv->rcv_action_signal = signal;
2263 	rval_priv->rcv_action_recipient = NULL;
2264 	rval_priv->rcv_action_recip_pid = -1;
2265 	rval_priv->rcv_firing_time = 0;
2266 	rval_priv->rcv_prev = rval_priv->rcv_next = NULL;
2267 
2268 	(void) rctl_val_list_insert(&rctl->rc_values, rval_priv);
2269 	rctl->rc_cursor = rval_priv;
2270 	rctl_val_list_reset(rctl->rc_cursor);
2271 	RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
2272 	    rctl->rc_cursor->rcv_value));
2273 
2274 	if (rlp64->rlim_cur != RLIM64_INFINITY && rlp64->rlim_cur < max) {
2275 		rval_basic = rctl_gp_detach_val(ragp);
2276 
2277 		rval_basic->rcv_privilege = RCPRIV_BASIC;
2278 		rval_basic->rcv_value = rlp64->rlim_cur;
2279 		rval_basic->rcv_flagaction = flagaction;
2280 		rval_basic->rcv_action_signal = signal;
2281 		rval_basic->rcv_action_recipient = p;
2282 		rval_basic->rcv_action_recip_pid = p->p_pid;
2283 		rval_basic->rcv_firing_time = 0;
2284 		rval_basic->rcv_prev = rval_basic->rcv_next = NULL;
2285 
2286 		(void) rctl_val_list_insert(&rctl->rc_values, rval_basic);
2287 		rctl->rc_cursor = rval_basic;
2288 		rctl_val_list_reset(rctl->rc_cursor);
2289 		RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
2290 		    rctl->rc_cursor->rcv_value));
2291 	}
2292 
2293 	ASSERT(rctl->rc_cursor != NULL);
2294 
2295 	mutex_exit(&rset->rcs_lock);
2296 	return (0);
2297 }
2298 
2299 
2300 /*
2301  * rctl_hndl_t rctl_register(const char *, rctl_entity_t, int, rlim64_t,
2302  *   rlim64_t, rctl_ops_t *)
2303  *
2304  * Overview
2305  *   rctl_register() performs a look-up in the dictionary of rctls
2306  *   active on the system; if a rctl of that name is absent, an entry is
2307  *   made into the dictionary.  The rctl is returned with its reference
2308  *   count incremented by one.  If the rctl name already exists, we panic.
2309  *   (Were the resource control system to support dynamic loading and unloading,
2310  *   which it is structured for, duplicate registration should lead to load
2311  *   failure instead of panicking.)
2312  *
2313  *   Each registered rctl has a requirement that a RCPRIV_SYSTEM limit be
2314  *   defined.  This limit contains the highest possible value for this quantity
2315  *   on the system.  Furthermore, the registered control must provide infinite
2316  *   values for all applicable address space models supported by the operating
2317  *   system.  Attempts to set resource control values beyond the system limit
2318  *   will fail.
2319  *
2320  * Return values
2321  *   The rctl's ID.
2322  *
2323  * Caller's context
2324  *   Caller must be in a context suitable for KM_SLEEP allocations.
2325  */
2326 rctl_hndl_t
2327 rctl_register(
2328     const char *name,
2329     rctl_entity_t entity,
2330     int global_flags,
2331     rlim64_t max_native,
2332     rlim64_t max_ilp32,
2333     rctl_ops_t *ops)
2334 {
2335 	rctl_t *rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
2336 	rctl_val_t *rctl_val = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2337 	rctl_dict_entry_t *rctl_de = kmem_zalloc(sizeof (rctl_dict_entry_t),
2338 	    KM_SLEEP);
2339 	rctl_t *old_rctl;
2340 	rctl_hndl_t rhndl;
2341 	int localflags;
2342 
2343 	ASSERT(ops != NULL);
2344 
2345 	bzero(rctl, sizeof (rctl_t));
2346 	bzero(rctl_val, sizeof (rctl_val_t));
2347 
2348 	if (global_flags & RCTL_GLOBAL_DENY_NEVER)
2349 		localflags = RCTL_LOCAL_MAXIMAL;
2350 	else
2351 		localflags = RCTL_LOCAL_MAXIMAL | RCTL_LOCAL_DENY;
2352 
2353 	rctl_val->rcv_privilege = RCPRIV_SYSTEM;
2354 	rctl_val->rcv_value = max_native;
2355 	rctl_val->rcv_flagaction = localflags;
2356 	rctl_val->rcv_action_signal = 0;
2357 	rctl_val->rcv_action_recipient = NULL;
2358 	rctl_val->rcv_action_recip_pid = -1;
2359 	rctl_val->rcv_firing_time = 0;
2360 	rctl_val->rcv_next = NULL;
2361 	rctl_val->rcv_prev = NULL;
2362 
2363 	rctl_de->rcd_name = (char *)name;
2364 	rctl_de->rcd_default_value = rctl_val;
2365 	rctl_de->rcd_max_native = max_native;
2366 	rctl_de->rcd_max_ilp32 = max_ilp32;
2367 	rctl_de->rcd_entity = entity;
2368 	rctl_de->rcd_ops = ops;
2369 	rctl_de->rcd_flagaction = global_flags;
2370 
2371 	rctl->rc_dict_entry = rctl_de;
2372 	rctl->rc_values = rctl_val;
2373 
2374 	/*
2375 	 * 1.  Take global lock, validate nonexistence of name, get ID.
2376 	 */
2377 	mutex_enter(&rctl_dict_lock);
2378 
2379 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
2380 	    (mod_hash_val_t *)&rhndl) != MH_ERR_NOTFOUND)
2381 		panic("duplicate registration of rctl %s", name);
2382 
2383 	rhndl = rctl_de->rcd_id = rctl->rc_id =
2384 	    (rctl_hndl_t)id_alloc(rctl_ids);
2385 
2386 	/*
2387 	 * 2.  Insert name-entry pair in rctl_dict_by_name.
2388 	 */
2389 	if (mod_hash_insert(rctl_dict_by_name, (mod_hash_key_t)name,
2390 	    (mod_hash_val_t)rctl_de))
2391 		panic("unable to insert rctl dict entry for %s (%u)", name,
2392 		    (uint_t)rctl->rc_id);
2393 
2394 	/*
2395 	 * 3.  Insert ID-rctl_t * pair in rctl_dict.
2396 	 */
2397 	if (mod_hash_find(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2398 	    (mod_hash_val_t *)&old_rctl) != MH_ERR_NOTFOUND)
2399 		panic("duplicate rctl ID %u registered", rctl->rc_id);
2400 
2401 	if (mod_hash_insert(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2402 	    (mod_hash_val_t)rctl))
2403 		panic("unable to insert rctl %s/%u (%p)", name,
2404 		    (uint_t)rctl->rc_id, rctl);
2405 
2406 	/*
2407 	 * 3a. Insert rctl_dict_entry_t * in appropriate entity list.
2408 	 */
2409 
2410 	mutex_enter(&rctl_lists_lock);
2411 
2412 	switch (entity) {
2413 	case RCENTITY_ZONE:
2414 	case RCENTITY_PROJECT:
2415 	case RCENTITY_TASK:
2416 	case RCENTITY_PROCESS:
2417 		rctl_de->rcd_next = rctl_lists[entity];
2418 		rctl_lists[entity] = rctl_de;
2419 		break;
2420 	default:
2421 		panic("registering unknown rctl entity %d (%s)", entity,
2422 		    name);
2423 		break;
2424 	}
2425 
2426 	mutex_exit(&rctl_lists_lock);
2427 
2428 	/*
2429 	 * 4.  Drop lock.
2430 	 */
2431 	mutex_exit(&rctl_dict_lock);
2432 
2433 	return (rhndl);
2434 }
2435 
2436 /*
2437  * static int rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p,
2438  *    rctl_val_t *v)
2439  *
2440  * Overview
2441  *   rctl_global_action() takes, in according with the flags on the rctl_dict
2442  *   entry for the given control, the appropriate actions on the exceeded
2443  *   control value.  Additionally, rctl_global_action() updates the firing time
2444  *   on the exceeded value.
2445  *
2446  * Return values
2447  *   A bitmask reflecting the actions actually taken.
2448  *
2449  * Caller's context
2450  *   No restrictions on context.
2451  */
2452 /*ARGSUSED*/
2453 static int
2454 rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v)
2455 {
2456 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2457 	const char *pr, *en, *idstr;
2458 	id_t id;
2459 	enum {
2460 		SUFFIX_NONE,	/* id consumed directly */
2461 		SUFFIX_NUMERIC,	/* id consumed in suffix */
2462 		SUFFIX_STRING	/* idstr consumed in suffix */
2463 	} suffix = SUFFIX_NONE;
2464 	int ret = 0;
2465 
2466 	v->rcv_firing_time = gethrtime();
2467 
2468 	switch (v->rcv_privilege) {
2469 	case RCPRIV_BASIC:
2470 		pr = "basic";
2471 		break;
2472 	case RCPRIV_PRIVILEGED:
2473 		pr = "privileged";
2474 		break;
2475 	case RCPRIV_SYSTEM:
2476 		pr = "system";
2477 		break;
2478 	default:
2479 		pr = "unknown";
2480 		break;
2481 	}
2482 
2483 	switch (rde->rcd_entity) {
2484 	case RCENTITY_PROCESS:
2485 		en = "process";
2486 		id = p->p_pid;
2487 		suffix = SUFFIX_NONE;
2488 		break;
2489 	case RCENTITY_TASK:
2490 		en = "task";
2491 		id = p->p_task->tk_tkid;
2492 		suffix = SUFFIX_NUMERIC;
2493 		break;
2494 	case RCENTITY_PROJECT:
2495 		en = "project";
2496 		id = p->p_task->tk_proj->kpj_id;
2497 		suffix = SUFFIX_NUMERIC;
2498 		break;
2499 	case RCENTITY_ZONE:
2500 		en = "zone";
2501 		idstr = p->p_zone->zone_name;
2502 		suffix = SUFFIX_STRING;
2503 		break;
2504 	default:
2505 		en = "unknown entity associated with process";
2506 		id = p->p_pid;
2507 		suffix = SUFFIX_NONE;
2508 		break;
2509 	}
2510 
2511 	if (rde->rcd_flagaction & RCTL_GLOBAL_SYSLOG) {
2512 		switch (suffix) {
2513 		default:
2514 		case SUFFIX_NONE:
2515 			(void) strlog(0, 0, 0,
2516 			    rde->rcd_strlog_flags | log_global.lz_active,
2517 			    "%s rctl %s (value %llu) exceeded by %s %d.",
2518 			    pr, rde->rcd_name, v->rcv_value, en, id);
2519 			break;
2520 		case SUFFIX_NUMERIC:
2521 			(void) strlog(0, 0, 0,
2522 			    rde->rcd_strlog_flags | log_global.lz_active,
2523 			    "%s rctl %s (value %llu) exceeded by process %d"
2524 			    " in %s %d.",
2525 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2526 			    en, id);
2527 			break;
2528 		case SUFFIX_STRING:
2529 			(void) strlog(0, 0, 0,
2530 			    rde->rcd_strlog_flags | log_global.lz_active,
2531 			    "%s rctl %s (value %llu) exceeded by process %d"
2532 			    " in %s %s.",
2533 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2534 			    en, idstr);
2535 			break;
2536 		}
2537 	}
2538 
2539 	if (rde->rcd_flagaction & RCTL_GLOBAL_DENY_ALWAYS)
2540 		ret |= RCT_DENY;
2541 
2542 	return (ret);
2543 }
2544 
/*
 * static int rctl_local_action(rctl_t *r, rctl_set_t *rset, struct proc *p,
 *    rctl_val_t *v, uint_t safety)
 *
 * Overview
 *   Takes the local (per-value) actions recorded on the exceeded value v:
 *   delivery of the configured signal (RCTL_LOCAL_SIGNAL) and/or denial
 *   (RCTL_LOCAL_DENY).  The safety argument restricts what may be done:
 *   with RCA_UNSAFE_ALL only the deny decision is evaluated and no signal is
 *   sent; with RCA_SAFE a sigqueue_t carrying SI_RCTL siginfo is allocated;
 *   otherwise the signal is posted without siginfo.
 *
 * Return values
 *   A bitmask of RCT_DENY, RCT_SIGNAL and RCT_LK_ABANDONED.  When
 *   RCT_LK_ABANDONED is set, the function returns with p->p_lock held but
 *   rset->rcs_lock NOT held; the caller must reacquire the set lock itself.
 *
 * Caller's context
 *   The function releases rset->rcs_lock and p->p_lock on several paths, so
 *   callers are presumably expected to hold both on entry (NOTE(review):
 *   confirm against callers).
 */
static int
rctl_local_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v,
    uint_t safety)
{
	int ret = 0;
	sigqueue_t *sqp = NULL;
	rctl_dict_entry_t *rde = r->rc_dict_entry;
	int unobservable = (rde->rcd_flagaction & RCTL_GLOBAL_UNOBSERVABLE);

	/*
	 * Snapshot the value's action fields up front; the locks may be
	 * dropped below, after which only these local copies are consulted.
	 */
	proc_t *recipient = v->rcv_action_recipient;
	id_t recip_pid = v->rcv_action_recip_pid;
	int recip_signal = v->rcv_action_signal;
	uint_t flagaction = v->rcv_flagaction;

	/*
	 * Most restrictive mode:  no signalling at all, and the deny action is
	 * honoured regardless of who the recipient is.
	 */
	if (safety == RCA_UNSAFE_ALL) {
		if (flagaction & RCTL_LOCAL_DENY) {
			ret |= RCT_DENY;
		}
		return (ret);
	}

	if (flagaction & RCTL_LOCAL_SIGNAL) {
		/*
		 * We can build a siginfo only in the case that it is
		 * safe for us to drop p_lock.  (For asynchronous
		 * checks this is currently not true.)
		 */
		if (safety == RCA_SAFE) {
			/*
			 * Both locks are released around the sleeping
			 * allocation and retaken in p_lock, rcs_lock order.
			 */
			mutex_exit(&rset->rcs_lock);
			mutex_exit(&p->p_lock);
			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
			mutex_enter(&p->p_lock);
			mutex_enter(&rset->rcs_lock);

			sqp->sq_info.si_signo = recip_signal;
			sqp->sq_info.si_code = SI_RCTL;
			sqp->sq_info.si_errno = 0;
			sqp->sq_info.si_entity = (int)rde->rcd_entity;
		}

		if (recipient == NULL || recipient == p) {
			/* The violating process itself is the target. */
			ret |= RCT_SIGNAL;

			if (sqp == NULL) {
				/* No siginfo available:  plain signal. */
				sigtoproc(p, NULL, recip_signal);
			} else if (p == curproc) {
				/*
				 * Then this is a synchronous test and we can
				 * direct the signal at the violating thread.
				 */
				sigaddqa(curproc, curthread, sqp);
			} else {
				sigaddqa(p, NULL, sqp);
			}
		} else if (!unobservable) {
			proc_t *rp;

			/*
			 * The recipient is some other process.  Let go of
			 * our own locks before taking pidlock to look it up.
			 */
			mutex_exit(&rset->rcs_lock);
			mutex_exit(&p->p_lock);

			mutex_enter(&pidlock);
			if ((rp = prfind(recip_pid)) == recipient) {
				/*
				 * Recipient process is still alive, but may not
				 * be in this task or project any longer.  In
				 * this case, the recipient's resource control
				 * set pertinent to this control will have
				 * changed--and we will not deliver the signal,
				 * as the recipient process is trying to tear
				 * itself off of its former set.
				 */
				mutex_enter(&rp->p_lock);
				mutex_exit(&pidlock);

				if (rctl_entity_obtain_rset(rde, rp) == rset) {
					ret |= RCT_SIGNAL;

					if (sqp == NULL)
						sigtoproc(rp, NULL,
						    recip_signal);
					else
						sigaddqa(rp, NULL, sqp);
				} else if (sqp) {
					/* Not delivered:  release siginfo. */
					kmem_free(sqp, sizeof (sigqueue_t));
				}
				mutex_exit(&rp->p_lock);
			} else {
				/*
				 * The pid no longer maps to the recorded
				 * recipient; nothing to signal.
				 */
				mutex_exit(&pidlock);
				if (sqp)
					kmem_free(sqp, sizeof (sigqueue_t));
			}

			mutex_enter(&p->p_lock);
			/*
			 * Since we dropped p_lock, we may no longer be in the
			 * same task or project as we were at entry.  It is thus
			 * unsafe for us to reacquire the set lock at this
			 * point; callers of rctl_local_action() must handle
			 * this possibility.
			 */
			ret |= RCT_LK_ABANDONED;
		} else if (sqp) {
			/*
			 * Foreign recipient on an unobservable control:  no
			 * signal is sent, so the siginfo is discarded.
			 */
			kmem_free(sqp, sizeof (sigqueue_t));
		}
	}

	/*
	 * Outside of RCA_UNSAFE_ALL, deny applies only when the value targets
	 * the violating process (or names no recipient).
	 */
	if ((flagaction & RCTL_LOCAL_DENY) &&
	    (recipient == NULL || recipient == p)) {
		ret |= RCT_DENY;
	}

	return (ret);
}
2658 
2659 /*
2660  * int rctl_action(rctl_hndl_t, rctl_set_t *, struct proc *, uint_t)
2661  *
2662  * Overview
2663  *   Take the action associated with the enforced value (as defined by
2664  *   rctl_get_enforced_value()) being exceeded or encountered.  Possibly perform
2665  *   a restricted subset of the available actions, if circumstances dictate that
2666  *   we cannot safely allocate memory (for a sigqueue_t) or guarantee process
2667  *   persistence across the duration of the function (an asynchronous action).
2668  *
2669  * Return values
2670  *   Actions taken, according to the rctl_test bitmask.
2671  *
2672  * Caller's context
2673  *   Safe to acquire rcs_lock.
2674  */
2675 int
2676 rctl_action(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p, uint_t safety)
2677 {
2678 	return (rctl_action_entity(hndl, rset, p, NULL, safety));
2679 }
2680 
2681 int
2682 rctl_action_entity(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p,
2683     rctl_entity_p_t *e, uint_t safety)
2684 {
2685 	int ret = RCT_NONE;
2686 	rctl_t *lrctl;
2687 	rctl_entity_p_t e_tmp;
2688 
2689 rctl_action_acquire:
2690 	mutex_enter(&rset->rcs_lock);
2691 	if (rctl_set_find(rset, hndl, &lrctl) == -1) {
2692 		mutex_exit(&rset->rcs_lock);
2693 		return (ret);
2694 	}
2695 
2696 	if (e == NULL) {
2697 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2698 		p, &e_tmp);
2699 		e = &e_tmp;
2700 	}
2701 
2702 	if ((ret & RCT_LK_ABANDONED) == 0) {
2703 		ret |= rctl_global_action(lrctl, rset, p, lrctl->rc_cursor);
2704 
2705 		RCTLOP_ACTION(lrctl, p, e);
2706 
2707 		ret |= rctl_local_action(lrctl, rset, p,
2708 		    lrctl->rc_cursor, safety);
2709 
2710 		if (ret & RCT_LK_ABANDONED)
2711 			goto rctl_action_acquire;
2712 	}
2713 
2714 	ret &= ~RCT_LK_ABANDONED;
2715 
2716 	if (!(ret & RCT_DENY) &&
2717 	    lrctl->rc_cursor->rcv_next != NULL) {
2718 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2719 
2720 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2721 		    p, lrctl->rc_cursor->rcv_value));
2722 
2723 	}
2724 	mutex_exit(&rset->rcs_lock);
2725 
2726 	return (ret);
2727 }
2728 
2729 /*
2730  * int rctl_test(rctl_hndl_t, rctl_set_t *, struct proc *, rctl_qty_t, uint_t)
2731  *
2732  * Overview
2733  *   Increment the resource associated with the given handle, returning zero if
2734  *   the incremented value does not exceed the threshold for the current limit
2735  *   on the resource.
2736  *
2737  * Return values
2738  *   Actions taken, according to the rctl_test bitmask.
2739  *
2740  * Caller's context
2741  *   p_lock held by caller.
2742  */
2743 /*ARGSUSED*/
2744 int
2745 rctl_test(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2746     rctl_qty_t incr, uint_t flags)
2747 {
2748 	return (rctl_test_entity(rhndl, rset, p, NULL, incr, flags));
2749 }
2750 
2751 int
2752 rctl_test_entity(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2753     rctl_entity_p_t *e, rctl_qty_t incr, uint_t flags)
2754 {
2755 	rctl_t *lrctl;
2756 	int ret = RCT_NONE;
2757 	rctl_entity_p_t e_tmp;
2758 	if (p == &p0) {
2759 		/*
2760 		 * We don't enforce rctls on the kernel itself.
2761 		 */
2762 		return (ret);
2763 	}
2764 
2765 rctl_test_acquire:
2766 	ASSERT(MUTEX_HELD(&p->p_lock));
2767 
2768 	mutex_enter(&rset->rcs_lock);
2769 
2770 	/*
2771 	 * Dereference from rctl_set.  We don't enforce newly loaded controls
2772 	 * that haven't been set on this entity (since the only valid value is
2773 	 * the infinite system value).
2774 	 */
2775 	if (rctl_set_find(rset, rhndl, &lrctl) == -1) {
2776 		mutex_exit(&rset->rcs_lock);
2777 		return (ret);
2778 	}
2779 
2780 	/*
2781 	 * This control is currently unenforced:  maximal value on control
2782 	 * supporting infinitely available resource.
2783 	 */
2784 	if ((lrctl->rc_dict_entry->rcd_flagaction & RCTL_GLOBAL_INFINITE) &&
2785 	    (lrctl->rc_cursor->rcv_flagaction & RCTL_LOCAL_MAXIMAL)) {
2786 
2787 		mutex_exit(&rset->rcs_lock);
2788 		return (ret);
2789 	}
2790 
2791 	/*
2792 	 * If we have been called by rctl_test, look up the entity pointer
2793 	 * from the proc pointer.
2794 	 */
2795 	if (e == NULL) {
2796 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2797 		p, &e_tmp);
2798 		e = &e_tmp;
2799 	}
2800 
2801 	/*
2802 	 * Get enforced rctl value and current usage.  Test the increment
2803 	 * with the current usage against the enforced value--take action as
2804 	 * necessary.
2805 	 */
2806 	while (RCTLOP_TEST(lrctl, p, e, lrctl->rc_cursor, incr, flags)) {
2807 		if ((ret & RCT_LK_ABANDONED) == 0) {
2808 			ret |= rctl_global_action(lrctl, rset, p,
2809 			    lrctl->rc_cursor);
2810 
2811 			RCTLOP_ACTION(lrctl, p, e);
2812 
2813 			ret |= rctl_local_action(lrctl, rset, p,
2814 			    lrctl->rc_cursor, flags);
2815 
2816 			if (ret & RCT_LK_ABANDONED)
2817 				goto rctl_test_acquire;
2818 		}
2819 
2820 		ret &= ~RCT_LK_ABANDONED;
2821 
2822 		if ((ret & RCT_DENY) == RCT_DENY ||
2823 		    lrctl->rc_cursor->rcv_next == NULL) {
2824 			ret |= RCT_DENY;
2825 			break;
2826 		}
2827 
2828 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2829 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2830 		    p, lrctl->rc_cursor->rcv_value));
2831 	}
2832 
2833 	mutex_exit(&rset->rcs_lock);
2834 
2835 	return (ret);
2836 }
2837 
2838 /*
2839  * void rctl_init(void)
2840  *
2841  * Overview
2842  *   Initialize the rctl subsystem, including the primoridal rctls
2843  *   provided by the system.  New subsystem-specific rctls should _not_ be
2844  *   initialized here.  (Do it in your own file.)
2845  *
2846  * Return values
2847  *   None.
2848  *
2849  * Caller's context
2850  *   Safe for KM_SLEEP allocations.  Must be called prior to any process model
2851  *   initialization.
2852  */
2853 void
2854 rctl_init(void)
2855 {
2856 	rctl_cache = kmem_cache_create("rctl_cache", sizeof (rctl_t),
2857 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2858 	rctl_val_cache = kmem_cache_create("rctl_val_cache",
2859 	    sizeof (rctl_val_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
2860 
2861 	rctl_dict = mod_hash_create_extended("rctl_dict",
2862 	    rctl_dict_size, mod_hash_null_keydtor, rctl_dict_val_dtor,
2863 	    rctl_dict_hash_by_id, NULL, rctl_dict_id_cmp, KM_SLEEP);
2864 	rctl_dict_by_name = mod_hash_create_strhash(
2865 	    "rctl_handles_by_name", rctl_dict_size,
2866 	    mod_hash_null_valdtor);
2867 	rctl_ids = id_space_create("rctl_ids", 1, max_rctl_hndl);
2868 	bzero(rctl_lists, (RC_MAX_ENTITY + 1) * sizeof (rctl_dict_entry_t *));
2869 
2870 	rctlproc_init();
2871 }
2872 
2873 /*
2874  * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc)
2875  *
2876  * Increments the amount of locked memory on a project, and
2877  * zone. If proj is NULL, the proj and zone of proc_t p is used.  If
2878  * chargeproc is non-zero, then the charged amount is cached on p->p_locked_mem
2879  * so that the charge can be migrated when a process changes projects.
2880  *
2881  * Return values
2882  *    0 - success
2883  *    EAGAIN - attempting to increment locked memory is denied by one
2884  *      or more resource entities.
2885  */
2886 int
2887 rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2888     int chargeproc)
2889 {
2890 	kproject_t *projp;
2891 	zone_t *zonep;
2892 	rctl_entity_p_t e;
2893 	int ret = 0;
2894 
2895 	ASSERT(p != NULL);
2896 	ASSERT(MUTEX_HELD(&p->p_lock));
2897 	if (proj != NULL) {
2898 		projp = proj;
2899 		zonep = zone_find_by_id(projp->kpj_zoneid);
2900 	} else {
2901 		projp = p->p_task->tk_proj;
2902 		zonep = p->p_zone;
2903 	}
2904 
2905 	mutex_enter(&zonep->zone_mem_lock);
2906 
2907 	e.rcep_p.proj = projp;
2908 	e.rcep_t = RCENTITY_PROJECT;
2909 	if (projp->kpj_data.kpd_locked_mem + inc >
2910 	    projp->kpj_data.kpd_locked_mem_ctl) {
2911 		if (rctl_test_entity(rc_project_locked_mem, projp->kpj_rctls,
2912 		    p, &e, inc, 0) & RCT_DENY) {
2913 			ret = EAGAIN;
2914 			goto out;
2915 		}
2916 	}
2917 	e.rcep_p.zone = zonep;
2918 	e.rcep_t = RCENTITY_ZONE;
2919 	if (zonep->zone_locked_mem + inc > zonep->zone_locked_mem_ctl) {
2920 		if (rctl_test_entity(rc_zone_locked_mem, zonep->zone_rctls,
2921 		    p, &e, inc, 0) & RCT_DENY) {
2922 			ret = EAGAIN;
2923 			goto out;
2924 		}
2925 	}
2926 
2927 	zonep->zone_locked_mem += inc;
2928 	projp->kpj_data.kpd_locked_mem += inc;
2929 	if (chargeproc != 0) {
2930 		p->p_locked_mem += inc;
2931 	}
2932 out:
2933 	mutex_exit(&zonep->zone_mem_lock);
2934 	if (proj != NULL)
2935 		zone_rele(zonep);
2936 	return (ret);
2937 }
2938 
2939 /*
2940  * rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc)
2941  *
2942  * Decrements the amount of locked memory on a project and
2943  * zone.  If proj is NULL, the proj and zone of proc_t p is used.  If
2944  * creditproc is non-zero, then the quantity of locked memory is subtracted
2945  * from p->p_locked_mem.
2946  *
2947  * Return values
2948  *   none
2949  */
2950 void
2951 rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2952     int creditproc)
2953 {
2954 	kproject_t *projp;
2955 	zone_t *zonep;
2956 
2957 	if (proj != NULL) {
2958 		projp = proj;
2959 		zonep = zone_find_by_id(projp->kpj_zoneid);
2960 	} else {
2961 		ASSERT(p != NULL);
2962 		ASSERT(MUTEX_HELD(&p->p_lock));
2963 		projp = p->p_task->tk_proj;
2964 		zonep = p->p_zone;
2965 	}
2966 
2967 	mutex_enter(&zonep->zone_mem_lock);
2968 	zonep->zone_locked_mem -= inc;
2969 	projp->kpj_data.kpd_locked_mem -= inc;
2970 	if (creditproc != 0) {
2971 		ASSERT(p != NULL);
2972 		ASSERT(MUTEX_HELD(&p->p_lock));
2973 		p->p_locked_mem -= inc;
2974 	}
2975 	mutex_exit(&zonep->zone_mem_lock);
2976 	if (proj != NULL)
2977 		zone_rele(zonep);
2978 }
2979 
2980 /*
2981  * rctl_incr_swap(proc_t *, zone_t *, size_t)
2982  *
2983  * Overview
2984  *   Increments the swap charge on the specified zone.
2985  *
2986  * Return values
2987  *   0 on success.  EAGAIN if swap increment fails due an rctl value
2988  *   on the zone.
2989  *
2990  * Callers context
2991  *   p_lock held on specified proc.
2992  *   swap must be even multiple of PAGESIZE
2993  */
2994 int
2995 rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap)
2996 {
2997 	rctl_entity_p_t e;
2998 
2999 	ASSERT(MUTEX_HELD(&proc->p_lock));
3000 	ASSERT((swap & PAGEOFFSET) == 0);
3001 	e.rcep_p.zone = zone;
3002 	e.rcep_t = RCENTITY_ZONE;
3003 
3004 	mutex_enter(&zone->zone_mem_lock);
3005 
3006 	if ((zone->zone_max_swap + swap) >
3007 	    zone->zone_max_swap_ctl) {
3008 
3009 		if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls,
3010 		    proc, &e, swap, 0) & RCT_DENY) {
3011 			mutex_exit(&zone->zone_mem_lock);
3012 			return (EAGAIN);
3013 		}
3014 	}
3015 	zone->zone_max_swap += swap;
3016 	mutex_exit(&zone->zone_mem_lock);
3017 	return (0);
3018 }
3019 
3020 /*
3021  * rctl_decr_swap(zone_t *, size_t)
3022  *
3023  * Overview
3024  *   Decrements the swap charge on the specified zone.
3025  *
3026  * Return values
3027  *   None
3028  *
3029  * Callers context
3030  *   swap must be even multiple of PAGESIZE
3031  */
3032 void
3033 rctl_decr_swap(zone_t *zone, size_t swap)
3034 {
3035 	ASSERT((swap & PAGEOFFSET) == 0);
3036 	mutex_enter(&zone->zone_mem_lock);
3037 	ASSERT(zone->zone_max_swap >= swap);
3038 	zone->zone_max_swap -= swap;
3039 	mutex_exit(&zone->zone_mem_lock);
3040 }
3041 
3042 /*
3043  * Create resource kstat
3044  */
3045 static kstat_t *
3046 rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class,
3047     uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid)
3048 {
3049 	kstat_t *ksp = NULL;
3050 	char name[KSTAT_STRLEN];
3051 
3052 	(void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance);
3053 
3054 	if ((ksp = kstat_create_zone("caps", ks_zoneid,
3055 		name, ks_class, ks_type,
3056 		ks_ndata, ks_flags, ks_zoneid)) != NULL) {
3057 		if (ks_zoneid != GLOBAL_ZONEID)
3058 			kstat_zone_add(ksp, GLOBAL_ZONEID);
3059 	}
3060 	return (ksp);
3061 }
3062 
3063 /*
3064  * Create zone-specific resource kstat
3065  */
3066 kstat_t *
3067 rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type,
3068     uint_t ks_ndata, uchar_t ks_flags)
3069 {
3070 	char name[KSTAT_STRLEN];
3071 
3072 	(void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name);
3073 
3074 	return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps",
3075 	    ks_type, ks_ndata, ks_flags, zone->zone_id));
3076 }
3077 
3078 /*
3079  * Create project-specific resource kstat
3080  */
3081 kstat_t *
3082 rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type,
3083     uint_t ks_ndata, uchar_t ks_flags)
3084 {
3085 	char name[KSTAT_STRLEN];
3086 
3087 	(void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name);
3088 
3089 	return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps",
3090 	    ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid));
3091 }
3092