xref: /titanic_41/usr/src/uts/common/os/rctl.c (revision 581cede61ac9c14d8d4ea452562a567189eead78)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/atomic.h>
27 #include <sys/cmn_err.h>
28 #include <sys/id_space.h>
29 #include <sys/kmem.h>
30 #include <sys/kstat.h>
31 #include <sys/log.h>
32 #include <sys/modctl.h>
33 #include <sys/modhash.h>
34 #include <sys/mutex.h>
35 #include <sys/proc.h>
36 #include <sys/procset.h>
37 #include <sys/project.h>
38 #include <sys/resource.h>
39 #include <sys/rctl.h>
40 #include <sys/siginfo.h>
41 #include <sys/strlog.h>
42 #include <sys/systm.h>
43 #include <sys/task.h>
44 #include <sys/types.h>
45 #include <sys/policy.h>
46 #include <sys/zone.h>
47 
48 /*
49  * Resource controls (rctls)
50  *
51  *   The rctl subsystem provides a mechanism for kernel components to
52  *   register their individual resource controls with the system as a whole,
53  *   such that those controls can subscribe to specific actions while being
54  *   associated with the various process-model entities provided by the kernel:
55  *   the process, the task, the project, and the zone.  (In principle, only
56  *   minor modifications would be required to connect the resource control
57  *   functionality to non-process-model entities associated with the system.)
58  *
59  *   Subsystems register their rctls via rctl_register().  Subsystems
60  *   also wishing to provide additional limits on a given rctl can modify
61  *   them once they have the rctl handle.  Each subsystem should store the
62  *   handle to their rctl for direct access.
63  *
64  *   A primary dictionary, rctl_dict, contains a hash of id to the default
65  *   control definition for each controlled resource-entity pair on the system.
66  *   A secondary dictionary, rctl_dict_by_name, contains a hash of name to
67  *   resource control handles.  The resource control handles are distributed by
68  *   the rctl_ids ID space.  The handles are private and not to be
69  *   advertised to userland; all userland interactions are via the rctl
70  *   names.
71  *
72  *   Entities inherit their rctls from their predecessor.  Since projects have
73  *   no ancestor, they inherit their rctls from the rctl dict for project
74  *   rctls.  It is expected that project controls will be set to their
75  *   appropriate values shortly after project creation, presumably from a
76  *   policy source such as the project database.
77  *
78  * Data structures
79  *   The rctl_set_t attached to each of the process model entities is a simple
80  *   hash table keyed on the rctl handle assigned at registration.  The entries
81  *   in the hash table are rctl_t's, whose relationship with the active control
82  *   values on that resource and with the global state of the resource we
83  *   illustrate below:
84  *
85  *   rctl_dict[key] --> rctl_dict_entry
86  *			   ^
87  *			   |
88  *			+--+---+
89  *   rctl_set[key] ---> | rctl | --> value <-> value <-> system value --> NULL
90  *			+--+---+		 ^
91  *			   |			 |
92  *			   +------- cursor ------+
93  *
94  *   That is, the rctl contains a back pointer to the global resource control
95  *   state for this resource, which is also available in the rctl_dict hash
96  *   table mentioned earlier.  The rctl contains two pointers to resource
97  *   control values:  one, values, indicates the entire sequence of control
98  *   values; the other, cursor, indicates the currently active control
99  *   value--the next value to be enforced.  The value list itself is an open,
100  *   doubly-linked list, the last non-NULL member of which is the system value
101  *   for that resource (being the theoretical/conventional maximum allowable
102  *   value for the resource on this OS instance).
103  *
104  * Ops Vector
105  *   Subsystems publishing rctls need not provide instances of all of the
106  *   functions specified by the ops vector.  In particular, if general
107  *   rctl_*() entry points are not being called, certain functions can be
108  *   omitted.  These align as follows:
109  *
110  *   rctl_set()
111  *     You may wish to provide a set callback if locking circumstances prevent
112  *     it or if the performance cost of requesting the enforced value from the
113  *     resource control is prohibitively expensive.  For instance, the currently
114  *     enforced file size limit is stored on the process in the p_fsz_ctl to
115  *     maintain read()/write() performance.
116  *
117  *   rctl_test()
118  *     You must provide a test callback if you are using the rctl_test()
119  *     interface.  An action callback is optional.
120  *
121  *   rctl_action()
122  *     You may wish to provide an action callback.
123  *
124  * Registration
125  *   New resource controls can be added to a running instance by loaded modules
126  *   via registration.  (The current implementation does not support unloadable
127  *   modules; this functionality can be added if needed, via an
128  *   activation/deactivation interface involving the manipulation of the
129  *   ops vector for the resource control(s) needing to support unloading.)
130  *
131  * Control value ordering
132  *   Because the rctl_val chain on each rctl must be navigable in a
133  *   deterministic way, we have to define an ordering on the rctl_val_t's.  The
134  *   defined order is (flags & [maximal], value, flags & [deny-action],
135  *   privilege).
136  *
137  * Locking
 *   rctl_dict_lock must be acquired prior to rctl_lists_lock.  Since
 *   rctl_dict_lock or rctl_lists_lock can be acquired at the enforcement
 *   point of any subsystem, while that subsystem's own locks are held, it is
 *   at all times inappropriate to call kmem_alloc(., KM_SLEEP) while holding
 *   either of these locks.
142  *   Traversing any of the various resource control entity lists requires
143  *   holding rctl_lists_lock.
144  *
145  *   Each individual resource control set associated with an entity must have
146  *   its rcs_lock held for the duration of any operations that would add
147  *   resource controls or control values to the set.
148  *
149  *   The locking subsequence of interest is: p_lock, rctl_dict_lock,
150  *   rctl_lists_lock, entity->rcs_lock.
151  *
152  * The projects(4) database and project entity resource controls
153  *   A special case is made for RCENTITY_PROJECT values set through the
154  *   setproject(3PROJECT) interface.  setproject() makes use of a private
155  *   interface, setprojrctl(), which passes through an array of resource control
156  *   blocks that need to be set while holding the entity->rcs_lock.  This
157  *   ensures that the act of modifying a project's resource controls is
158  *   "atomic" within the kernel.
159  *
160  *   Within the rctl sub-system, we provide two interfaces that are only used by
161  *   the setprojrctl() code path - rctl_local_insert_all() and
162  *   rctl_local_replace_all().  rctl_local_insert_all() will ensure that the
163  *   resource values specified in *new_values are applied.
164  *   rctl_local_replace_all() will purge the current rctl->rc_projdb and
165  *   rctl->rc_values entries, and apply the *new_values.
166  *
167  *   These functions modify not only the linked list of active resource controls
168  *   (rctl->rc_values), but also a "cached" linked list (rctl->rc_projdb) of
169  *   values set through these interfaces.  To clarify:
170  *
171  *      rctl->rc_values - a linked list of rctl_val_t.  These are the active
172  *      resource values associated with this rctl, and may have been set by
173  *      setrctl() - via prctl(1M), or by setprojrctl() - via
174  *      setproject(3PROJECT).
175  *
176  *      rctl->rc_projdb - a linked list of rctl_val_t.  These reflect the
177  *      resource values set by the setprojrctl() code path.  rc_projdb is not
178  *      referenced by any other component of the rctl sub-system.
179  *
180  *   As various locks are held when calling these functions, we ensure that all
181  *   the possible memory allocations are performed prior to calling the
182  *   function.  *alloc_values is a linked list of uninitialized rctl_val_t,
183  *   which may be used to duplicate a new resource control value (passed in as
184  *   one of the members of the *new_values linked list), in order to populate
185  *   rctl->rc_values.
186  */
187 
188 id_t max_rctl_hndl = 32768;
189 int rctl_dict_size = 64;
190 int rctl_set_size = 8;
191 kmutex_t rctl_dict_lock;
192 mod_hash_t *rctl_dict;
193 mod_hash_t *rctl_dict_by_name;
194 id_space_t *rctl_ids;
195 kmem_cache_t *rctl_cache;	/* kmem cache for rctl structures */
196 kmem_cache_t *rctl_val_cache;	/* kmem cache for rctl values */
197 
198 kmutex_t rctl_lists_lock;
199 rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1];
200 
201 /*
202  * Default resource control operations and ops vector
203  *   To be used if the particular rcontrol has no specific actions defined, or
204  *   if the subsystem providing the control is quiescing (in preparation for
205  *   unloading, presumably.)
206  *
207  *   Resource controls with callbacks should fill the unused operations with the
208  *   appropriate default impotent callback.
209  */
210 /*ARGSUSED*/
211 void
212 rcop_no_action(struct rctl *r, struct proc *p, rctl_entity_p_t *e)
213 {
214 }
215 
216 /*ARGSUSED*/
217 rctl_qty_t
218 rcop_no_usage(struct rctl *r, struct proc *p)
219 {
220 	return (0);
221 }
222 
223 /*ARGSUSED*/
224 int
225 rcop_no_set(struct rctl *r, struct proc *p, rctl_entity_p_t *e, rctl_qty_t l)
226 {
227 	return (0);
228 }
229 
230 /*ARGSUSED*/
231 int
232 rcop_no_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
233     struct rctl_val *rv, rctl_qty_t i, uint_t f)
234 {
235 	return (0);
236 }
237 
238 rctl_ops_t rctl_default_ops = {
239 	rcop_no_action,
240 	rcop_no_usage,
241 	rcop_no_set,
242 	rcop_no_test
243 };
244 
245 /*
246  * Default "absolute" resource control operation and ops vector
247  *   Useful if there is no usage associated with the
248  *   resource control.
249  */
250 /*ARGSUSED*/
251 int
252 rcop_absolute_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
253     struct rctl_val *rv, rctl_qty_t i, uint_t f)
254 {
255 	return (i > rv->rcv_value);
256 }
257 
258 rctl_ops_t rctl_absolute_ops = {
259 	rcop_no_action,
260 	rcop_no_usage,
261 	rcop_no_set,
262 	rcop_absolute_test
263 };
264 
265 /*ARGSUSED*/
266 static uint_t
267 rctl_dict_hash_by_id(void *hash_data, mod_hash_key_t key)
268 {
269 	return ((uint_t)(uintptr_t)key % rctl_dict_size);
270 }
271 
272 static int
273 rctl_dict_id_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
274 {
275 	uint_t u1 = (uint_t)(uintptr_t)key1;
276 	uint_t u2 = (uint_t)(uintptr_t)key2;
277 
278 	if (u1 > u2)
279 		return (1);
280 
281 	if (u1 == u2)
282 		return (0);
283 
284 	return (-1);
285 }
286 
287 static void
288 rctl_dict_val_dtor(mod_hash_val_t val)
289 {
290 	rctl_dict_entry_t *kr = (rctl_dict_entry_t *)val;
291 
292 	kmem_free(kr, sizeof (rctl_dict_entry_t));
293 }
294 
295 /*
296  * size_t rctl_build_name_buf()
297  *
298  * Overview
299  *   rctl_build_name_buf() walks all active resource controls in the dictionary,
300  *   building a buffer of continguous NUL-terminated strings.
301  *
302  * Return values
303  *   The size of the buffer is returned, the passed pointer's contents are
304  *   modified to that of the location of the buffer.
305  *
306  * Caller's context
307  *   Caller must be in a context suitable for KM_SLEEP allocations.
308  */
309 size_t
310 rctl_build_name_buf(char **rbufp)
311 {
312 	size_t req_size, cpy_size;
313 	char *rbufloc;
314 	int i;
315 
316 rctl_rebuild_name_buf:
317 	req_size = cpy_size = 0;
318 
319 	/*
320 	 * Calculate needed buffer length.
321 	 */
322 	mutex_enter(&rctl_lists_lock);
323 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
324 		rctl_dict_entry_t *rde;
325 
326 		for (rde = rctl_lists[i];
327 		    rde != NULL;
328 		    rde = rde->rcd_next)
329 			req_size += strlen(rde->rcd_name) + 1;
330 	}
331 	mutex_exit(&rctl_lists_lock);
332 
333 	rbufloc = *rbufp = kmem_alloc(req_size, KM_SLEEP);
334 
335 	/*
336 	 * Copy rctl names into our buffer.  If the copy length exceeds the
337 	 * allocate length (due to registration changes), stop copying, free the
338 	 * buffer, and start again.
339 	 */
340 	mutex_enter(&rctl_lists_lock);
341 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
342 		rctl_dict_entry_t *rde;
343 
344 		for (rde = rctl_lists[i];
345 		    rde != NULL;
346 		    rde = rde->rcd_next) {
347 			size_t length = strlen(rde->rcd_name) + 1;
348 
349 			cpy_size += length;
350 
351 			if (cpy_size > req_size) {
352 				kmem_free(*rbufp, req_size);
353 				mutex_exit(&rctl_lists_lock);
354 				goto rctl_rebuild_name_buf;
355 			}
356 
357 			bcopy(rde->rcd_name, rbufloc, length);
358 			rbufloc += length;
359 		}
360 	}
361 	mutex_exit(&rctl_lists_lock);
362 
363 	return (req_size);
364 }
365 
366 /*
367  * rctl_dict_entry_t *rctl_dict_lookup(const char *)
368  *
369  * Overview
370  *   rctl_dict_lookup() returns the resource control dictionary entry for the
371  *   named resource control.
372  *
373  * Return values
374  *   A pointer to the appropriate resource control dictionary entry, or NULL if
375  *   no such named entry exists.
376  *
377  * Caller's context
378  *   Caller must not be holding rctl_dict_lock.
379  */
380 rctl_dict_entry_t *
381 rctl_dict_lookup(const char *name)
382 {
383 	rctl_dict_entry_t *rde;
384 
385 	mutex_enter(&rctl_dict_lock);
386 
387 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
388 	    (mod_hash_val_t *)&rde) == MH_ERR_NOTFOUND) {
389 		mutex_exit(&rctl_dict_lock);
390 		return (NULL);
391 	}
392 
393 	mutex_exit(&rctl_dict_lock);
394 
395 	return (rde);
396 }
397 
398 /*
399  * rctl_hndl_t rctl_hndl_lookup(const char *)
400  *
401  * Overview
402  *   rctl_hndl_lookup() returns the resource control id (the "handle") for the
403  *   named resource control.
404  *
405  * Return values
406  *   The appropriate id, or -1 if no such named entry exists.
407  *
408  * Caller's context
409  *   Caller must not be holding rctl_dict_lock.
410  */
411 rctl_hndl_t
412 rctl_hndl_lookup(const char *name)
413 {
414 	rctl_dict_entry_t *rde;
415 
416 	if ((rde = rctl_dict_lookup(name)) == NULL)
417 		return (-1);
418 
419 	return (rde->rcd_id);
420 }
421 
422 /*
423  * rctl_dict_entry_t * rctl_dict_lookup_hndl(rctl_hndl_t)
424  *
425  * Overview
426  *   rctl_dict_lookup_hndl() completes the public lookup functions, by returning
427  *   the resource control dictionary entry matching a given resource control id.
428  *
429  * Return values
430  *   A pointer to the matching resource control dictionary entry, or NULL if the
431  *   id does not match any existing entries.
432  *
433  * Caller's context
434  *   Caller must not be holding rctl_lists_lock.
435  */
436 rctl_dict_entry_t *
437 rctl_dict_lookup_hndl(rctl_hndl_t hndl)
438 {
439 	uint_t i;
440 
441 	mutex_enter(&rctl_lists_lock);
442 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
443 		rctl_dict_entry_t *rde;
444 
445 		for (rde = rctl_lists[i];
446 		    rde != NULL;
447 		    rde = rde->rcd_next)
448 			if (rde->rcd_id == hndl) {
449 				mutex_exit(&rctl_lists_lock);
450 				return (rde);
451 			}
452 	}
453 	mutex_exit(&rctl_lists_lock);
454 
455 	return (NULL);
456 }
457 
458 /*
459  * void rctl_add_default_limit(const char *name, rctl_qty_t value,
460  *     rctl_priv_t privilege, uint_t action)
461  *
462  * Overview
463  *   Create a default limit with specified value, privilege, and action.
464  *
465  * Return value
466  *   No value returned.
467  */
468 void
469 rctl_add_default_limit(const char *name, rctl_qty_t value,
470     rctl_priv_t privilege, uint_t action)
471 {
472 	rctl_val_t *dval;
473 	rctl_dict_entry_t *rde;
474 
475 	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
476 	bzero(dval, sizeof (rctl_val_t));
477 	dval->rcv_value = value;
478 	dval->rcv_privilege = privilege;
479 	dval->rcv_flagaction = action;
480 	dval->rcv_action_recip_pid = -1;
481 
482 	rde = rctl_dict_lookup(name);
483 	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
484 }
485 
486 /*
487  * void rctl_add_legacy_limit(const char *name, const char *mname,
488  *     const char *lname, rctl_qty_t dflt)
489  *
490  * Overview
491  *   Create a default privileged limit, using the value obtained from
492  *   /etc/system if it exists and is greater than the specified default
493  *   value.  Exists primarily for System V IPC.
494  *
495  * Return value
496  *   No value returned.
497  */
498 void
499 rctl_add_legacy_limit(const char *name, const char *mname, const char *lname,
500     rctl_qty_t dflt, rctl_qty_t max)
501 {
502 	rctl_qty_t qty;
503 
504 	if (!mod_sysvar(mname, lname, &qty) || (qty < dflt))
505 		qty = dflt;
506 
507 	if (qty > max)
508 		qty = max;
509 
510 	rctl_add_default_limit(name, qty, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
511 }
512 
513 rctl_set_t *
514 rctl_entity_obtain_rset(rctl_dict_entry_t *rcd, struct proc *p)
515 {
516 	rctl_set_t *rset = NULL;
517 
518 	if (rcd == NULL)
519 		return (NULL);
520 
521 	switch (rcd->rcd_entity) {
522 	case RCENTITY_PROCESS:
523 		rset = p->p_rctls;
524 		break;
525 	case RCENTITY_TASK:
526 		ASSERT(MUTEX_HELD(&p->p_lock));
527 		if (p->p_task != NULL)
528 			rset = p->p_task->tk_rctls;
529 		break;
530 	case RCENTITY_PROJECT:
531 		ASSERT(MUTEX_HELD(&p->p_lock));
532 		if (p->p_task != NULL &&
533 		    p->p_task->tk_proj != NULL)
534 			rset = p->p_task->tk_proj->kpj_rctls;
535 		break;
536 	case RCENTITY_ZONE:
537 		ASSERT(MUTEX_HELD(&p->p_lock));
538 		if (p->p_zone != NULL)
539 			rset = p->p_zone->zone_rctls;
540 		break;
541 	default:
542 		panic("unknown rctl entity type %d seen", rcd->rcd_entity);
543 		break;
544 	}
545 
546 	return (rset);
547 }
548 
549 static void
550 rctl_entity_obtain_entity_p(rctl_entity_t entity, struct proc *p,
551     rctl_entity_p_t *e)
552 {
553 	e->rcep_p.proc = NULL;
554 	e->rcep_t = entity;
555 
556 	switch (entity) {
557 	case RCENTITY_PROCESS:
558 		e->rcep_p.proc = p;
559 		break;
560 	case RCENTITY_TASK:
561 		ASSERT(MUTEX_HELD(&p->p_lock));
562 		if (p->p_task != NULL)
563 			e->rcep_p.task = p->p_task;
564 		break;
565 	case RCENTITY_PROJECT:
566 		ASSERT(MUTEX_HELD(&p->p_lock));
567 		if (p->p_task != NULL &&
568 		    p->p_task->tk_proj != NULL)
569 			e->rcep_p.proj = p->p_task->tk_proj;
570 		break;
571 	case RCENTITY_ZONE:
572 		ASSERT(MUTEX_HELD(&p->p_lock));
573 		if (p->p_zone != NULL)
574 			e->rcep_p.zone = p->p_zone;
575 		break;
576 	default:
577 		panic("unknown rctl entity type %d seen", entity);
578 		break;
579 	}
580 }
581 
582 static void
583 rctl_gp_alloc(rctl_alloc_gp_t *rcgp)
584 {
585 	uint_t i;
586 
587 	if (rcgp->rcag_nctls > 0) {
588 		rctl_t *prev = kmem_cache_alloc(rctl_cache, KM_SLEEP);
589 		rctl_t *rctl = prev;
590 
591 		rcgp->rcag_ctls = prev;
592 
593 		for (i = 1; i < rcgp->rcag_nctls; i++) {
594 			rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
595 			prev->rc_next = rctl;
596 			prev = rctl;
597 		}
598 
599 		rctl->rc_next = NULL;
600 	}
601 
602 	if (rcgp->rcag_nvals > 0) {
603 		rctl_val_t *prev = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
604 		rctl_val_t *rval = prev;
605 
606 		rcgp->rcag_vals = prev;
607 
608 		for (i = 1; i < rcgp->rcag_nvals; i++) {
609 			rval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
610 			prev->rcv_next = rval;
611 			prev = rval;
612 		}
613 
614 		rval->rcv_next = NULL;
615 	}
616 
617 }
618 
619 static rctl_val_t *
620 rctl_gp_detach_val(rctl_alloc_gp_t *rcgp)
621 {
622 	rctl_val_t *rval = rcgp->rcag_vals;
623 
624 	ASSERT(rcgp->rcag_nvals > 0);
625 	rcgp->rcag_nvals--;
626 	rcgp->rcag_vals = rval->rcv_next;
627 
628 	rval->rcv_next = NULL;
629 
630 	return (rval);
631 }
632 
633 static rctl_t *
634 rctl_gp_detach_ctl(rctl_alloc_gp_t *rcgp)
635 {
636 	rctl_t *rctl = rcgp->rcag_ctls;
637 
638 	ASSERT(rcgp->rcag_nctls > 0);
639 	rcgp->rcag_nctls--;
640 	rcgp->rcag_ctls = rctl->rc_next;
641 
642 	rctl->rc_next = NULL;
643 
644 	return (rctl);
645 
646 }
647 
648 static void
649 rctl_gp_free(rctl_alloc_gp_t *rcgp)
650 {
651 	rctl_val_t *rval = rcgp->rcag_vals;
652 	rctl_t *rctl = rcgp->rcag_ctls;
653 
654 	while (rval != NULL) {
655 		rctl_val_t *next = rval->rcv_next;
656 
657 		kmem_cache_free(rctl_val_cache, rval);
658 		rval = next;
659 	}
660 
661 	while (rctl != NULL) {
662 		rctl_t *next = rctl->rc_next;
663 
664 		kmem_cache_free(rctl_cache, rctl);
665 		rctl = next;
666 	}
667 }
668 
669 /*
670  * void rctl_prealloc_destroy(rctl_alloc_gp_t *)
671  *
672  * Overview
673  *   Release all unused memory allocated via one of the "prealloc" functions:
674  *   rctl_set_init_prealloc, rctl_set_dup_prealloc, or rctl_rlimit_set_prealloc.
675  *
676  * Return values
677  *   None.
678  *
679  * Caller's context
680  *   No restrictions on context.
681  */
682 void
683 rctl_prealloc_destroy(rctl_alloc_gp_t *gp)
684 {
685 	rctl_gp_free(gp);
686 	kmem_free(gp, sizeof (rctl_alloc_gp_t));
687 }
688 
689 /*
690  * int rctl_val_cmp(rctl_val_t *, rctl_val_t *, int)
691  *
692  * Overview
693  *   This function defines an ordering to rctl_val_t's in order to allow
694  *   for correct placement in value lists. When the imprecise flag is set,
695  *   the action recipient is ignored. This is to facilitate insert,
696  *   delete, and replace operations by rctlsys.
697  *
698  * Return values
699  *   0 if the val_t's are are considered identical
700  *   -1 if a is ordered lower than b
701  *   1 if a is lowered higher than b
702  *
703  * Caller's context
704  *   No restrictions on context.
705  */
706 int
707 rctl_val_cmp(rctl_val_t *a, rctl_val_t *b, int imprecise)
708 {
709 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) <
710 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
711 		return (-1);
712 
713 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) >
714 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
715 		return (1);
716 
717 	if (a->rcv_value < b->rcv_value)
718 		return (-1);
719 
720 	if (a->rcv_value > b->rcv_value)
721 		return (1);
722 
723 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) <
724 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
725 		return (-1);
726 
727 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) >
728 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
729 		return (1);
730 
731 	if (a->rcv_privilege < b->rcv_privilege)
732 		return (-1);
733 
734 	if (a->rcv_privilege > b->rcv_privilege)
735 		return (1);
736 
737 	if (imprecise)
738 		return (0);
739 
740 	if (a->rcv_action_recip_pid < b->rcv_action_recip_pid)
741 		return (-1);
742 
743 	if (a->rcv_action_recip_pid > b->rcv_action_recip_pid)
744 		return (1);
745 
746 	return (0);
747 }
748 
749 static rctl_val_t *
750 rctl_val_list_find(rctl_val_t **head, rctl_val_t *cval)
751 {
752 	rctl_val_t *rval = *head;
753 
754 	while (rval != NULL) {
755 		if (rctl_val_cmp(cval, rval, 0) == 0)
756 			return (rval);
757 
758 		rval = rval->rcv_next;
759 	}
760 
761 	return (NULL);
762 
763 }
764 
765 /*
766  * int rctl_val_list_insert(rctl_val_t **, rctl_val_t *)
767  *
768  * Overview
769  *   This function inserts the rctl_val_t into the value list provided.
770  *   The insert is always successful unless if the value is a duplicate
771  *   of one already in the list.
772  *
773  * Return values
774  *    1 if the value was a duplicate of an existing value in the list.
775  *    0 if the insert was successful.
776  */
777 int
778 rctl_val_list_insert(rctl_val_t **root, rctl_val_t *rval)
779 {
780 	rctl_val_t *prev;
781 	int equiv;
782 
783 	rval->rcv_next = NULL;
784 	rval->rcv_prev = NULL;
785 
786 	if (*root == NULL) {
787 		*root = rval;
788 		return (0);
789 	}
790 
791 	equiv = rctl_val_cmp(rval, *root, 0);
792 
793 	if (equiv == 0)
794 		return (1);
795 
796 	if (equiv < 0) {
797 		rval->rcv_next = *root;
798 		rval->rcv_next->rcv_prev = rval;
799 		*root = rval;
800 
801 		return (0);
802 	}
803 
804 	prev = *root;
805 	while (prev->rcv_next != NULL &&
806 	    (equiv = rctl_val_cmp(rval, prev->rcv_next, 0)) > 0) {
807 		prev = prev->rcv_next;
808 	}
809 
810 	if (equiv == 0)
811 		return (1);
812 
813 	rval->rcv_next = prev->rcv_next;
814 	if (rval->rcv_next != NULL)
815 		rval->rcv_next->rcv_prev = rval;
816 	prev->rcv_next = rval;
817 	rval->rcv_prev = prev;
818 
819 	return (0);
820 }
821 
822 static int
823 rctl_val_list_delete(rctl_val_t **root, rctl_val_t *rval)
824 {
825 	rctl_val_t *prev;
826 
827 	if (*root == NULL)
828 		return (-1);
829 
830 	prev = *root;
831 	if (rctl_val_cmp(rval, prev, 0) == 0) {
832 		*root = prev->rcv_next;
833 		if (*root != NULL)
834 			(*root)->rcv_prev = NULL;
835 
836 		kmem_cache_free(rctl_val_cache, prev);
837 
838 		return (0);
839 	}
840 
841 	while (prev->rcv_next != NULL &&
842 	    rctl_val_cmp(rval, prev->rcv_next, 0) != 0) {
843 		prev = prev->rcv_next;
844 	}
845 
846 	if (prev->rcv_next == NULL) {
847 		/*
848 		 * If we navigate the entire list and cannot find a match, then
849 		 * return failure.
850 		 */
851 		return (-1);
852 	}
853 
854 	prev = prev->rcv_next;
855 	prev->rcv_prev->rcv_next = prev->rcv_next;
856 	if (prev->rcv_next != NULL)
857 		prev->rcv_next->rcv_prev = prev->rcv_prev;
858 
859 	kmem_cache_free(rctl_val_cache, prev);
860 
861 	return (0);
862 }
863 
864 static rctl_val_t *
865 rctl_val_list_dup(rctl_val_t *rval, rctl_alloc_gp_t *ragp, struct proc *oldp,
866     struct proc *newp)
867 {
868 	rctl_val_t *head = NULL;
869 
870 	for (; rval != NULL; rval = rval->rcv_next) {
871 		rctl_val_t *dval = rctl_gp_detach_val(ragp);
872 
873 		bcopy(rval, dval, sizeof (rctl_val_t));
874 		dval->rcv_prev = dval->rcv_next = NULL;
875 
876 		if (oldp == NULL ||
877 		    rval->rcv_action_recipient == NULL ||
878 		    rval->rcv_action_recipient == oldp) {
879 			if (rval->rcv_privilege == RCPRIV_BASIC) {
880 				dval->rcv_action_recipient = newp;
881 				dval->rcv_action_recip_pid = newp->p_pid;
882 			} else {
883 				dval->rcv_action_recipient = NULL;
884 				dval->rcv_action_recip_pid = -1;
885 			}
886 
887 			(void) rctl_val_list_insert(&head, dval);
888 		} else {
889 			kmem_cache_free(rctl_val_cache, dval);
890 		}
891 	}
892 
893 	return (head);
894 }
895 
896 static void
897 rctl_val_list_reset(rctl_val_t *rval)
898 {
899 	for (; rval != NULL; rval = rval->rcv_next)
900 		rval->rcv_firing_time = 0;
901 }
902 
903 static uint_t
904 rctl_val_list_count(rctl_val_t *rval)
905 {
906 	uint_t n = 0;
907 
908 	for (; rval != NULL; rval = rval->rcv_next)
909 		n++;
910 
911 	return (n);
912 }
913 
914 
915 static void
916 rctl_val_list_free(rctl_val_t *rval)
917 {
918 	while (rval != NULL) {
919 		rctl_val_t *next = rval->rcv_next;
920 
921 		kmem_cache_free(rctl_val_cache, rval);
922 
923 		rval = next;
924 	}
925 }
926 
927 /*
928  * rctl_qty_t rctl_model_maximum(rctl_dict_entry_t *, struct proc *)
929  *
930  * Overview
931  *   In cases where the operating system supports more than one process
932  *   addressing model, the operating system capabilities will exceed those of
933  *   one or more of these models.  Processes in a less capable model must have
934  *   their resources accurately controlled, without diluting those of their
935  *   descendants reached via exec().  rctl_model_maximum() returns the governing
936  *   value for the specified process with respect to a resource control, such
937  *   that the value can used for the RCTLOP_SET callback or compatability
938  *   support.
939  *
940  * Return values
941  *   The maximum value for the given process for the specified resource control.
942  *
943  * Caller's context
944  *   No restrictions on context.
945  */
946 rctl_qty_t
947 rctl_model_maximum(rctl_dict_entry_t *rde, struct proc *p)
948 {
949 	if (p->p_model == DATAMODEL_NATIVE)
950 		return (rde->rcd_max_native);
951 
952 	return (rde->rcd_max_ilp32);
953 }
954 
955 /*
956  * rctl_qty_t rctl_model_value(rctl_dict_entry_t *, struct proc *, rctl_qty_t)
957  *
958  * Overview
959  *   Convenience function wrapping the rctl_model_maximum() functionality.
960  *
961  * Return values
962  *   The lesser of the process's maximum value and the given value for the
963  *   specified resource control.
964  *
965  * Caller's context
966  *   No restrictions on context.
967  */
968 rctl_qty_t
969 rctl_model_value(rctl_dict_entry_t *rde, struct proc *p, rctl_qty_t value)
970 {
971 	rctl_qty_t max = rctl_model_maximum(rde, p);
972 
973 	return (value < max ? value : max);
974 }
975 
976 static void
977 rctl_set_insert(rctl_set_t *set, rctl_hndl_t hndl, rctl_t *rctl)
978 {
979 	uint_t index = hndl % rctl_set_size;
980 	rctl_t *next_ctl, *prev_ctl;
981 
982 	ASSERT(MUTEX_HELD(&set->rcs_lock));
983 
984 	rctl->rc_next = NULL;
985 
986 	if (set->rcs_ctls[index] == NULL) {
987 		set->rcs_ctls[index] = rctl;
988 		return;
989 	}
990 
991 	if (hndl < set->rcs_ctls[index]->rc_id) {
992 		rctl->rc_next = set->rcs_ctls[index];
993 		set->rcs_ctls[index] = rctl;
994 
995 		return;
996 	}
997 
998 	for (next_ctl = set->rcs_ctls[index]->rc_next,
999 	    prev_ctl = set->rcs_ctls[index];
1000 	    next_ctl != NULL;
1001 	    prev_ctl = next_ctl,
1002 	    next_ctl = next_ctl->rc_next) {
1003 		if (next_ctl->rc_id > hndl) {
1004 			rctl->rc_next = next_ctl;
1005 			prev_ctl->rc_next = rctl;
1006 
1007 			return;
1008 		}
1009 	}
1010 
1011 	rctl->rc_next = next_ctl;
1012 	prev_ctl->rc_next = rctl;
1013 }
1014 
1015 /*
1016  * rctl_set_t *rctl_set_create()
1017  *
1018  * Overview
1019  *   Create an empty resource control set, suitable for attaching to a
1020  *   controlled entity.
1021  *
1022  * Return values
1023  *   A pointer to the newly created set.
1024  *
1025  * Caller's context
1026  *   Safe for KM_SLEEP allocations.
1027  */
1028 rctl_set_t *
1029 rctl_set_create()
1030 {
1031 	rctl_set_t *rset = kmem_zalloc(sizeof (rctl_set_t), KM_SLEEP);
1032 
1033 	mutex_init(&rset->rcs_lock, NULL, MUTEX_DEFAULT, NULL);
1034 	rset->rcs_ctls = kmem_zalloc(rctl_set_size * sizeof (rctl_t *),
1035 	    KM_SLEEP);
1036 	rset->rcs_entity = -1;
1037 
1038 	return (rset);
1039 }
1040 
1041 /*
1042  * rctl_gp_alloc_t *rctl_set_init_prealloc(rctl_entity_t)
1043  *
1044  * Overview
1045  *    rctl_set_init_prealloc() examines the globally defined resource controls
1046  *    and their default values and returns a resource control allocation group
1047  *    populated with sufficient controls and values to form a representative
1048  *    resource control set for the specified entity.
1049  *
1050  * Return values
1051  *    A pointer to the newly created allocation group.
1052  *
1053  * Caller's context
1054  *    Caller must be in a context suitable for KM_SLEEP allocations.
1055  */
1056 rctl_alloc_gp_t *
1057 rctl_set_init_prealloc(rctl_entity_t entity)
1058 {
1059 	rctl_dict_entry_t *rde;
1060 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1061 
1062 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1063 
1064 	if (rctl_lists[entity] == NULL)
1065 		return (ragp);
1066 
1067 	mutex_enter(&rctl_lists_lock);
1068 
1069 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1070 		ragp->rcag_nctls++;
1071 		ragp->rcag_nvals += rctl_val_list_count(rde->rcd_default_value);
1072 	}
1073 
1074 	mutex_exit(&rctl_lists_lock);
1075 
1076 	rctl_gp_alloc(ragp);
1077 
1078 	return (ragp);
1079 }
1080 
1081 /*
1082  * rctl_set_t *rctl_set_init(rctl_entity_t)
1083  *
1084  * Overview
1085  *   rctl_set_create() creates a resource control set, initialized with the
1086  *   system infinite values on all registered controls, for attachment to a
1087  *   system entity requiring resource controls, such as a process or a task.
1088  *
1089  * Return values
1090  *   A pointer to the newly filled set.
1091  *
1092  * Caller's context
1093  *   Caller must be holding p_lock on entry so that RCTLOP_SET() functions
1094  *   may modify task and project members based on the proc structure
1095  *   they are passed.
1096  */
1097 rctl_set_t *
1098 rctl_set_init(rctl_entity_t entity, struct proc *p, rctl_entity_p_t *e,
1099     rctl_set_t *rset, rctl_alloc_gp_t *ragp)
1100 {
1101 	rctl_dict_entry_t *rde;
1102 
1103 	ASSERT(MUTEX_HELD(&p->p_lock));
1104 	ASSERT(e);
1105 	rset->rcs_entity = entity;
1106 
1107 	if (rctl_lists[entity] == NULL)
1108 		return (rset);
1109 
1110 	mutex_enter(&rctl_lists_lock);
1111 	mutex_enter(&rset->rcs_lock);
1112 
1113 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1114 		rctl_t *rctl = rctl_gp_detach_ctl(ragp);
1115 
1116 		rctl->rc_dict_entry = rde;
1117 		rctl->rc_id = rde->rcd_id;
1118 		rctl->rc_projdb = NULL;
1119 
1120 		rctl->rc_values = rctl_val_list_dup(rde->rcd_default_value,
1121 		    ragp, NULL, p);
1122 		rctl->rc_cursor = rctl->rc_values;
1123 
1124 		ASSERT(rctl->rc_cursor != NULL);
1125 
1126 		rctl_set_insert(rset, rde->rcd_id, rctl);
1127 
1128 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1129 		    rctl->rc_cursor->rcv_value));
1130 	}
1131 
1132 	mutex_exit(&rset->rcs_lock);
1133 	mutex_exit(&rctl_lists_lock);
1134 
1135 	return (rset);
1136 }
1137 
1138 static rctl_t *
1139 rctl_dup(rctl_t *rctl, rctl_alloc_gp_t *ragp, struct proc *oldp,
1140     struct proc *newp)
1141 {
1142 	rctl_t *dup = rctl_gp_detach_ctl(ragp);
1143 	rctl_val_t *dval;
1144 
1145 	dup->rc_id = rctl->rc_id;
1146 	dup->rc_dict_entry = rctl->rc_dict_entry;
1147 	dup->rc_next = NULL;
1148 	dup->rc_cursor = NULL;
1149 	dup->rc_values = rctl_val_list_dup(rctl->rc_values, ragp, oldp, newp);
1150 
1151 	for (dval = dup->rc_values;
1152 	    dval != NULL; dval = dval->rcv_next) {
1153 		if (rctl_val_cmp(rctl->rc_cursor, dval, 0) >= 0) {
1154 			dup->rc_cursor = dval;
1155 			break;
1156 		}
1157 	}
1158 
1159 	if (dup->rc_cursor == NULL)
1160 		dup->rc_cursor = dup->rc_values;
1161 
1162 	return (dup);
1163 }
1164 
1165 static void
1166 rctl_set_fill_alloc_gp(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1167 {
1168 	uint_t i;
1169 
1170 	bzero(ragp, sizeof (rctl_alloc_gp_t));
1171 
1172 	for (i = 0; i < rctl_set_size; i++) {
1173 		rctl_t *r = set->rcs_ctls[i];
1174 
1175 		while (r != NULL) {
1176 			ragp->rcag_nctls++;
1177 
1178 			ragp->rcag_nvals += rctl_val_list_count(r->rc_values);
1179 
1180 			r = r->rc_next;
1181 		}
1182 	}
1183 }
1184 
1185 /*
1186  * rctl_alloc_gp_t *rctl_set_dup_prealloc(rctl_set_t *)
1187  *
1188  * Overview
1189  *   Given a resource control set, allocate a sufficiently large allocation
1190  *   group to contain a duplicate of the set.
1191  *
1192  * Return value
1193  *   A pointer to the newly created allocation group.
1194  *
1195  * Caller's context
1196  *   Safe for KM_SLEEP allocations.
1197  */
1198 rctl_alloc_gp_t *
1199 rctl_set_dup_prealloc(rctl_set_t *set)
1200 {
1201 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1202 
1203 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1204 
1205 	mutex_enter(&set->rcs_lock);
1206 	rctl_set_fill_alloc_gp(set, ragp);
1207 	mutex_exit(&set->rcs_lock);
1208 
1209 	rctl_gp_alloc(ragp);
1210 
1211 	return (ragp);
1212 }
1213 
1214 /*
1215  * int rctl_set_dup_ready(rctl_set_t *, rctl_alloc_gp_t *)
1216  *
1217  * Overview
1218  *   Verify that the allocation group provided is large enough to allow a
1219  *   duplicate of the given resource control set to be constructed from its
1220  *   contents.
1221  *
1222  * Return values
1223  *   1 if the allocation group is sufficiently large, 0 otherwise.
1224  *
1225  * Caller's context
1226  *   rcs_lock must be held prior to entry.
1227  */
1228 int
1229 rctl_set_dup_ready(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1230 {
1231 	rctl_alloc_gp_t curr_gp;
1232 
1233 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1234 
1235 	rctl_set_fill_alloc_gp(set, &curr_gp);
1236 
1237 	if (curr_gp.rcag_nctls <= ragp->rcag_nctls &&
1238 	    curr_gp.rcag_nvals <= ragp->rcag_nvals)
1239 		return (1);
1240 
1241 	return (0);
1242 }
1243 
1244 /*
1245  * rctl_set_t *rctl_set_dup(rctl_set_t *, struct proc *, struct proc *,
1246  *   rctl_set_t *, rctl_alloc_gp_t *, int)
1247  *
1248  * Overview
1249  *   Make a duplicate of the resource control set.  The proc pointers are those
1250  *   of the owning process and of the process associated with the entity
1251  *   receiving the duplicate.
1252  *
1253  *   Duplication is a 3 stage process. Stage 1 is memory allocation for
1254  *   the duplicate set, which is taken care of by rctl_set_dup_prealloc().
1255  *   Stage 2 consists of copying all rctls and values from the old set into
1256  *   the new. Stage 3 completes the duplication by performing the appropriate
1257  *   callbacks for each rctl in the new set.
1258  *
1259  *   Stages 2 and 3 are handled by calling rctl_set_dup with the RCD_DUP and
1260  *   RCD_CALLBACK functions, respectively. The RCD_CALLBACK flag may only
1261  *   be supplied if the newp proc structure reflects the new task and
1262  *   project linkage.
1263  *
1264  * Return value
1265  *   A pointer to the duplicate set.
1266  *
1267  * Caller's context
1268  *   The rcs_lock of the set to be duplicated must be held prior to entry.
1269  */
1270 rctl_set_t *
1271 rctl_set_dup(rctl_set_t *set, struct proc *oldp, struct proc *newp,
1272     rctl_entity_p_t *e, rctl_set_t *dup, rctl_alloc_gp_t *ragp, int flag)
1273 {
1274 	uint_t i;
1275 	rctl_set_t	*iter;
1276 
1277 	ASSERT((flag & RCD_DUP) || (flag & RCD_CALLBACK));
1278 	ASSERT(e);
1279 	/*
1280 	 * When copying the old set, iterate over that. Otherwise, when
1281 	 * only callbacks have been requested, iterate over the dup set.
1282 	 */
1283 	if (flag & RCD_DUP) {
1284 		ASSERT(MUTEX_HELD(&set->rcs_lock));
1285 		iter = set;
1286 		dup->rcs_entity = set->rcs_entity;
1287 	} else {
1288 		iter = dup;
1289 	}
1290 
1291 	mutex_enter(&dup->rcs_lock);
1292 
1293 	for (i = 0; i < rctl_set_size; i++) {
1294 		rctl_t *r = iter->rcs_ctls[i];
1295 		rctl_t *d;
1296 
1297 		while (r != NULL) {
1298 			if (flag & RCD_DUP) {
1299 				d = rctl_dup(r, ragp, oldp, newp);
1300 				rctl_set_insert(dup, r->rc_id, d);
1301 			} else {
1302 				d = r;
1303 			}
1304 
1305 			if (flag & RCD_CALLBACK)
1306 				RCTLOP_SET(d, newp, e,
1307 				    rctl_model_value(d->rc_dict_entry, newp,
1308 				    d->rc_cursor->rcv_value));
1309 
1310 			r = r->rc_next;
1311 		}
1312 	}
1313 
1314 	mutex_exit(&dup->rcs_lock);
1315 
1316 	return (dup);
1317 }
1318 
1319 /*
1320  * void rctl_set_free(rctl_set_t *)
1321  *
1322  * Overview
1323  *   Delete resource control set and all attached values.
1324  *
1325  * Return values
1326  *   No value returned.
1327  *
1328  * Caller's context
1329  *   No restrictions on context.
1330  */
1331 void
1332 rctl_set_free(rctl_set_t *set)
1333 {
1334 	uint_t i;
1335 
1336 	mutex_enter(&set->rcs_lock);
1337 	for (i = 0; i < rctl_set_size; i++) {
1338 		rctl_t *r = set->rcs_ctls[i];
1339 
1340 		while (r != NULL) {
1341 			rctl_val_t *v = r->rc_values;
1342 			rctl_t *n = r->rc_next;
1343 
1344 			kmem_cache_free(rctl_cache, r);
1345 
1346 			rctl_val_list_free(v);
1347 
1348 			r = n;
1349 		}
1350 	}
1351 	mutex_exit(&set->rcs_lock);
1352 
1353 	kmem_free(set->rcs_ctls, sizeof (rctl_t *) * rctl_set_size);
1354 	kmem_free(set, sizeof (rctl_set_t));
1355 }
1356 
1357 /*
1358  * void rctl_set_reset(rctl_set_t *)
1359  *
1360  * Overview
1361  *   Resets all rctls within the set such that the lowest value becomes active.
1362  *
1363  * Return values
1364  *   No value returned.
1365  *
1366  * Caller's context
1367  *   No restrictions on context.
1368  */
1369 void
1370 rctl_set_reset(rctl_set_t *set, struct proc *p, rctl_entity_p_t *e)
1371 {
1372 	uint_t i;
1373 
1374 	ASSERT(e);
1375 
1376 	mutex_enter(&set->rcs_lock);
1377 	for (i = 0; i < rctl_set_size; i++) {
1378 		rctl_t *r = set->rcs_ctls[i];
1379 
1380 		while (r != NULL) {
1381 			r->rc_cursor = r->rc_values;
1382 			rctl_val_list_reset(r->rc_cursor);
1383 			RCTLOP_SET(r, p, e, rctl_model_value(r->rc_dict_entry,
1384 			    p, r->rc_cursor->rcv_value));
1385 
1386 			ASSERT(r->rc_cursor != NULL);
1387 
1388 			r = r->rc_next;
1389 		}
1390 	}
1391 
1392 	mutex_exit(&set->rcs_lock);
1393 }
1394 
1395 /*
1396  * void rctl_set_tearoff(rctl_set *, struct proc *)
1397  *
1398  * Overview
1399  *   Tear off any resource control values on this set with an action recipient
1400  *   equal to the specified process (as they are becoming invalid with the
1401  *   process's departure from this set as an observer).
1402  *
1403  * Return values
1404  *   No value returned.
1405  *
1406  * Caller's context
1407  *   No restrictions on context
1408  */
1409 void
1410 rctl_set_tearoff(rctl_set_t *set, struct proc *p)
1411 {
1412 	uint_t i;
1413 
1414 	mutex_enter(&set->rcs_lock);
1415 	for (i = 0; i < rctl_set_size; i++) {
1416 		rctl_t *r = set->rcs_ctls[i];
1417 
1418 		while (r != NULL) {
1419 			rctl_val_t *rval;
1420 
1421 tearoff_rewalk_list:
1422 			rval = r->rc_values;
1423 
1424 			while (rval != NULL) {
1425 				if (rval->rcv_privilege == RCPRIV_BASIC &&
1426 				    rval->rcv_action_recipient == p) {
1427 					if (r->rc_cursor == rval)
1428 						r->rc_cursor = rval->rcv_next;
1429 
1430 					(void) rctl_val_list_delete(
1431 					    &r->rc_values, rval);
1432 
1433 					goto tearoff_rewalk_list;
1434 				}
1435 
1436 				rval = rval->rcv_next;
1437 			}
1438 
1439 			ASSERT(r->rc_cursor != NULL);
1440 
1441 			r = r->rc_next;
1442 		}
1443 	}
1444 
1445 	mutex_exit(&set->rcs_lock);
1446 }
1447 
1448 int
1449 rctl_set_find(rctl_set_t *set, rctl_hndl_t hndl, rctl_t **rctl)
1450 {
1451 	uint_t index = hndl % rctl_set_size;
1452 	rctl_t *curr_ctl;
1453 
1454 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1455 
1456 	for (curr_ctl = set->rcs_ctls[index]; curr_ctl != NULL;
1457 	    curr_ctl = curr_ctl->rc_next) {
1458 		if (curr_ctl->rc_id == hndl) {
1459 			*rctl = curr_ctl;
1460 
1461 			return (0);
1462 		}
1463 	}
1464 
1465 	return (-1);
1466 }
1467 
1468 /*
1469  * rlim64_t rctl_enforced_value(rctl_hndl_t, rctl_set_t *, struct proc *)
1470  *
1471  * Overview
1472  *   Given a process, get the next enforced value on the rctl of the specified
1473  *   handle.
1474  *
1475  * Return value
1476  *   The enforced value.
1477  *
1478  * Caller's context
1479  *   For controls on process collectives, p->p_lock must be held across the
1480  *   operation.
1481  */
1482 /*ARGSUSED*/
1483 rctl_qty_t
1484 rctl_enforced_value(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p)
1485 {
1486 	rctl_t *rctl;
1487 	rlim64_t ret;
1488 
1489 	mutex_enter(&rset->rcs_lock);
1490 
1491 	if (rctl_set_find(rset, hndl, &rctl) == -1)
1492 		panic("unknown resource control handle %d requested", hndl);
1493 	else
1494 		ret = rctl_model_value(rctl->rc_dict_entry, p,
1495 		    rctl->rc_cursor->rcv_value);
1496 
1497 	mutex_exit(&rset->rcs_lock);
1498 
1499 	return (ret);
1500 }
1501 
1502 /*
1503  * int rctl_global_get(const char *, rctl_dict_entry_t *)
1504  *
1505  * Overview
1506  *   Copy a sanitized version of the global rctl for a given resource control
1507  *   name.  (By sanitization, we mean that the unsafe data pointers have been
1508  *   zeroed.)
1509  *
1510  * Return value
1511  *   -1 if name not defined, 0 otherwise.
1512  *
1513  * Caller's context
1514  *   No restrictions on context.  rctl_dict_lock must not be held.
1515  */
1516 int
1517 rctl_global_get(const char *name, rctl_dict_entry_t *drde)
1518 {
1519 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1520 
1521 	if (rde == NULL)
1522 		return (-1);
1523 
1524 	bcopy(rde, drde, sizeof (rctl_dict_entry_t));
1525 
1526 	drde->rcd_next = NULL;
1527 	drde->rcd_ops = NULL;
1528 
1529 	return (0);
1530 }
1531 
1532 /*
1533  * int rctl_global_set(const char *, rctl_dict_entry_t *)
1534  *
1535  * Overview
1536  *   Transfer the settable fields of the named rctl to the global rctl matching
1537  *   the given resource control name.
1538  *
1539  * Return value
1540  *   -1 if name not defined, 0 otherwise.
1541  *
1542  * Caller's context
1543  *   No restrictions on context.  rctl_dict_lock must not be held.
1544  */
1545 int
1546 rctl_global_set(const char *name, rctl_dict_entry_t *drde)
1547 {
1548 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1549 
1550 	if (rde == NULL)
1551 		return (-1);
1552 
1553 	rde->rcd_flagaction = drde->rcd_flagaction;
1554 	rde->rcd_syslog_level = drde->rcd_syslog_level;
1555 	rde->rcd_strlog_flags = drde->rcd_strlog_flags;
1556 
1557 	return (0);
1558 }
1559 
1560 static int
1561 rctl_local_op(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1562     int (*cbop)(rctl_hndl_t, struct proc *p, rctl_entity_p_t *e, rctl_t *,
1563     rctl_val_t *, rctl_val_t *), struct proc *p)
1564 {
1565 	rctl_t *rctl;
1566 	rctl_set_t *rset;
1567 	rctl_entity_p_t e;
1568 	int ret = 0;
1569 	rctl_dict_entry_t *rde = rctl_dict_lookup_hndl(hndl);
1570 
1571 local_op_retry:
1572 
1573 	ASSERT(MUTEX_HELD(&p->p_lock));
1574 
1575 	rset = rctl_entity_obtain_rset(rde, p);
1576 
1577 	if (rset == NULL) {
1578 		return (-1);
1579 	}
1580 	rctl_entity_obtain_entity_p(rset->rcs_entity, p, &e);
1581 
1582 	mutex_enter(&rset->rcs_lock);
1583 
1584 	/* using rctl's hndl, get rctl from local set */
1585 	if (rctl_set_find(rset, hndl, &rctl) == -1) {
1586 		mutex_exit(&rset->rcs_lock);
1587 		return (-1);
1588 	}
1589 
1590 	ret = cbop(hndl, p, &e, rctl, oval, nval);
1591 
1592 	mutex_exit(&rset->rcs_lock);
1593 	return (ret);
1594 }
1595 
1596 /*ARGSUSED*/
1597 static int
1598 rctl_local_get_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1599     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1600 {
1601 	if (oval == NULL) {
1602 		/*
1603 		 * RCTL_FIRST
1604 		 */
1605 		bcopy(rctl->rc_values, nval, sizeof (rctl_val_t));
1606 	} else {
1607 		/*
1608 		 * RCTL_NEXT
1609 		 */
1610 		rctl_val_t *tval = rctl_val_list_find(&rctl->rc_values, oval);
1611 
1612 		if (tval == NULL)
1613 			return (ESRCH);
1614 		else if (tval->rcv_next == NULL)
1615 			return (ENOENT);
1616 		else
1617 			bcopy(tval->rcv_next, nval, sizeof (rctl_val_t));
1618 	}
1619 
1620 	return (0);
1621 }
1622 
1623 /*
1624  * int rctl_local_get(rctl_hndl_t, rctl_val_t *)
1625  *
1626  * Overview
1627  *   Get the rctl value for the given flags.
1628  *
1629  * Return values
1630  *   0 for successful get, errno otherwise.
1631  */
1632 int
1633 rctl_local_get(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1634     struct proc *p)
1635 {
1636 	return (rctl_local_op(hndl, oval, nval, rctl_local_get_cb, p));
1637 }
1638 
1639 /*ARGSUSED*/
1640 static int
1641 rctl_local_delete_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1642     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1643 {
1644 	if ((oval = rctl_val_list_find(&rctl->rc_values, nval)) == NULL)
1645 		return (ESRCH);
1646 
1647 	if (rctl->rc_cursor == oval) {
1648 		rctl->rc_cursor = oval->rcv_next;
1649 		rctl_val_list_reset(rctl->rc_cursor);
1650 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1651 		    rctl->rc_cursor->rcv_value));
1652 
1653 		ASSERT(rctl->rc_cursor != NULL);
1654 	}
1655 
1656 	(void) rctl_val_list_delete(&rctl->rc_values, oval);
1657 
1658 	return (0);
1659 }
1660 
1661 /*
1662  * int rctl_local_delete(rctl_hndl_t, rctl_val_t *)
1663  *
1664  * Overview
1665  *   Delete the rctl value for the given flags.
1666  *
1667  * Return values
1668  *   0 for successful delete, errno otherwise.
1669  */
1670 int
1671 rctl_local_delete(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1672 {
1673 	return (rctl_local_op(hndl, NULL, val, rctl_local_delete_cb, p));
1674 }
1675 
1676 /*
1677  * rctl_local_insert_cb()
1678  *
1679  * Overview
1680  *   Insert a new value into the rctl's val list. If an error occurs,
1681  *   the val list must be left in the same state as when the function
1682  *   was entered.
1683  *
1684  * Return Values
1685  *   0 for successful insert, EINVAL if the value is duplicated in the
1686  *   existing list.
1687  */
1688 /*ARGSUSED*/
1689 static int
1690 rctl_local_insert_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1691     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1692 {
1693 	/*
1694 	 * Before inserting, confirm there are no duplicates of this value
1695 	 * and flag level. If there is a duplicate, flag an error and do
1696 	 * nothing.
1697 	 */
1698 	if (rctl_val_list_insert(&rctl->rc_values, nval) != 0)
1699 		return (EINVAL);
1700 
1701 	if (rctl_val_cmp(nval, rctl->rc_cursor, 0) < 0) {
1702 		rctl->rc_cursor = nval;
1703 		rctl_val_list_reset(rctl->rc_cursor);
1704 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1705 		    rctl->rc_cursor->rcv_value));
1706 
1707 		ASSERT(rctl->rc_cursor != NULL);
1708 	}
1709 
1710 	return (0);
1711 }
1712 
1713 /*
1714  * int rctl_local_insert(rctl_hndl_t, rctl_val_t *)
1715  *
1716  * Overview
1717  *   Insert the rctl value into the appropriate rctl set for the calling
1718  *   process, given the handle.
1719  */
1720 int
1721 rctl_local_insert(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1722 {
1723 	return (rctl_local_op(hndl, NULL, val, rctl_local_insert_cb, p));
1724 }
1725 
1726 /*
1727  * rctl_local_insert_all_cb()
1728  *
1729  * Overview
1730  *   Called for RCENTITY_PROJECT rctls only, via rctlsys_projset().
1731  *
1732  *   Inserts new values from the project database (new_values).  alloc_values
1733  *   should be a linked list of pre-allocated rctl_val_t, which are used to
1734  *   populate (rc_projdb).
1735  *
1736  *   Should the *new_values linked list match the contents of the rctl's
1737  *   rp_projdb then we do nothing.
1738  *
1739  * Return Values
1740  *   0 is always returned.
1741  */
1742 /*ARGSUSED*/
1743 static int
1744 rctl_local_insert_all_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1745     rctl_t *rctl, rctl_val_t *new_values, rctl_val_t *alloc_values)
1746 {
1747 	rctl_val_t *val;
1748 	rctl_val_t *tmp_val;
1749 	rctl_val_t *next;
1750 	int modified = 0;
1751 
1752 	/*
1753 	 * If this the first time we've set this project rctl, then we delete
1754 	 * all the privilege values.  These privilege values have been set by
1755 	 * rctl_add_default_limit().
1756 	 *
1757 	 * We save some cycles here by not calling rctl_val_list_delete().
1758 	 */
1759 	if (rctl->rc_projdb == NULL) {
1760 		val = rctl->rc_values;
1761 
1762 		while (val != NULL) {
1763 			if (val->rcv_privilege == RCPRIV_PRIVILEGED) {
1764 				if (val->rcv_prev != NULL)
1765 					val->rcv_prev->rcv_next = val->rcv_next;
1766 				else
1767 					rctl->rc_values = val->rcv_next;
1768 
1769 				if (val->rcv_next != NULL)
1770 					val->rcv_next->rcv_prev = val->rcv_prev;
1771 
1772 				tmp_val = val;
1773 				val = val->rcv_next;
1774 				kmem_cache_free(rctl_val_cache, tmp_val);
1775 			} else {
1776 				val = val->rcv_next;
1777 			}
1778 		}
1779 		modified = 1;
1780 	}
1781 
1782 	/*
1783 	 * Delete active values previously set through the project database.
1784 	 */
1785 	val = rctl->rc_projdb;
1786 
1787 	while (val != NULL) {
1788 
1789 		/* Is the old value found in the new values? */
1790 		if (rctl_val_list_find(&new_values, val) == NULL) {
1791 
1792 			/*
1793 			 * Delete from the active values if it originated from
1794 			 * the project database.
1795 			 */
1796 			if (((tmp_val = rctl_val_list_find(&rctl->rc_values,
1797 			    val)) != NULL) &&
1798 			    (tmp_val->rcv_flagaction & RCTL_LOCAL_PROJDB)) {
1799 				(void) rctl_val_list_delete(&rctl->rc_values,
1800 				    tmp_val);
1801 			}
1802 
1803 			tmp_val = val->rcv_next;
1804 			(void) rctl_val_list_delete(&rctl->rc_projdb, val);
1805 			val = tmp_val;
1806 			modified = 1;
1807 
1808 		} else
1809 			val = val->rcv_next;
1810 	}
1811 
1812 	/*
1813 	 * Insert new values from the project database.
1814 	 */
1815 	while (new_values != NULL) {
1816 		next = new_values->rcv_next;
1817 
1818 		/*
1819 		 * Insert this new value into the rc_projdb, and duplicate this
1820 		 * entry to the active list.
1821 		 */
1822 		if (rctl_val_list_insert(&rctl->rc_projdb, new_values) == 0) {
1823 
1824 			tmp_val = alloc_values->rcv_next;
1825 			bcopy(new_values, alloc_values, sizeof (rctl_val_t));
1826 			alloc_values->rcv_next = tmp_val;
1827 
1828 			if (rctl_val_list_insert(&rctl->rc_values,
1829 			    alloc_values) == 0) {
1830 				/* inserted move alloc_values on */
1831 				alloc_values = tmp_val;
1832 				modified = 1;
1833 			}
1834 		} else {
1835 			/*
1836 			 * Unlike setrctl() we don't want to return an error on
1837 			 * a duplicate entry; we are concerned solely with
1838 			 * ensuring that all the values specified are set.
1839 			 */
1840 			kmem_cache_free(rctl_val_cache, new_values);
1841 		}
1842 		new_values = next;
1843 	}
1844 
1845 	/* Teardown any unused rctl_val_t */
1846 	while (alloc_values != NULL) {
1847 		tmp_val = alloc_values;
1848 		alloc_values = alloc_values->rcv_next;
1849 		kmem_cache_free(rctl_val_cache, tmp_val);
1850 	}
1851 
1852 	/* Reset the cursor if rctl values have been modified */
1853 	if (modified) {
1854 		rctl->rc_cursor = rctl->rc_values;
1855 		rctl_val_list_reset(rctl->rc_cursor);
1856 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1857 		    rctl->rc_cursor->rcv_value));
1858 	}
1859 
1860 	return (0);
1861 }
1862 
1863 int
1864 rctl_local_insert_all(rctl_hndl_t hndl, rctl_val_t *new_values,
1865     rctl_val_t *alloc_values, struct proc *p)
1866 {
1867 	return (rctl_local_op(hndl, new_values, alloc_values,
1868 	    rctl_local_insert_all_cb, p));
1869 }
1870 
1871 /*
1872  * rctl_local_replace_all_cb()
1873  *
1874  * Overview
1875  *   Called for RCENTITY_PROJECT rctls only, via rctlsys_projset().
1876  *
1877  *   Clears the active rctl values (rc_values), and stored values from the
1878  *   previous insertions from the project database (rc_projdb).
1879  *
1880  *   Inserts new values from the project database (new_values).  alloc_values
1881  *   should be a linked list of pre-allocated rctl_val_t, which are used to
1882  *   populate (rc_projdb).
1883  *
1884  * Return Values
1885  *   0 is always returned.
1886  */
1887 /*ARGSUSED*/
1888 static int
1889 rctl_local_replace_all_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1890     rctl_t *rctl, rctl_val_t *new_values, rctl_val_t *alloc_values)
1891 {
1892 	rctl_val_t *val;
1893 	rctl_val_t *next;
1894 	rctl_val_t *tmp_val;
1895 
1896 	/* Delete all the privilege vaules */
1897 	val = rctl->rc_values;
1898 
1899 	while (val != NULL) {
1900 		if (val->rcv_privilege == RCPRIV_PRIVILEGED) {
1901 			if (val->rcv_prev != NULL)
1902 				val->rcv_prev->rcv_next = val->rcv_next;
1903 			else
1904 				rctl->rc_values = val->rcv_next;
1905 
1906 			if (val->rcv_next != NULL)
1907 				val->rcv_next->rcv_prev = val->rcv_prev;
1908 
1909 			tmp_val = val;
1910 			val = val->rcv_next;
1911 			kmem_cache_free(rctl_val_cache, tmp_val);
1912 		} else {
1913 			val = val->rcv_next;
1914 		}
1915 	}
1916 
1917 	/* Delete the contents of rc_projdb */
1918 	val = rctl->rc_projdb;
1919 	while (val != NULL) {
1920 
1921 		tmp_val = val;
1922 		val = val->rcv_next;
1923 		kmem_cache_free(rctl_val_cache, tmp_val);
1924 	}
1925 	rctl->rc_projdb = NULL;
1926 
1927 	/*
1928 	 * Insert new values from the project database.
1929 	 */
1930 	while (new_values != NULL) {
1931 		next = new_values->rcv_next;
1932 
1933 		if (rctl_val_list_insert(&rctl->rc_projdb, new_values) == 0) {
1934 			tmp_val = alloc_values->rcv_next;
1935 			bcopy(new_values, alloc_values, sizeof (rctl_val_t));
1936 			alloc_values->rcv_next = tmp_val;
1937 
1938 			if (rctl_val_list_insert(&rctl->rc_values,
1939 			    alloc_values) == 0) {
1940 				/* inserted, so move alloc_values on */
1941 				alloc_values = tmp_val;
1942 			}
1943 		} else {
1944 			/*
1945 			 * Unlike setrctl() we don't want to return an error on
1946 			 * a duplicate entry; we are concerned solely with
1947 			 * ensuring that all the values specified are set.
1948 			 */
1949 			kmem_cache_free(rctl_val_cache, new_values);
1950 		}
1951 
1952 		new_values = next;
1953 	}
1954 
1955 	/* Teardown any unused rctl_val_t */
1956 	while (alloc_values != NULL) {
1957 		tmp_val = alloc_values;
1958 		alloc_values = alloc_values->rcv_next;
1959 		kmem_cache_free(rctl_val_cache, tmp_val);
1960 	}
1961 
1962 	/* Always reset the cursor */
1963 	rctl->rc_cursor = rctl->rc_values;
1964 	rctl_val_list_reset(rctl->rc_cursor);
1965 	RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1966 	    rctl->rc_cursor->rcv_value));
1967 
1968 	return (0);
1969 }
1970 
1971 int
1972 rctl_local_replace_all(rctl_hndl_t hndl, rctl_val_t *new_values,
1973     rctl_val_t *alloc_values, struct proc *p)
1974 {
1975 	return (rctl_local_op(hndl, new_values, alloc_values,
1976 	    rctl_local_replace_all_cb, p));
1977 }
1978 
1979 static int
1980 rctl_local_replace_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1981     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1982 {
1983 	int ret;
1984 	rctl_val_t *tmp;
1985 
1986 	/* Verify that old will be delete-able */
1987 	tmp = rctl_val_list_find(&rctl->rc_values, oval);
1988 	if (tmp == NULL)
1989 		return (ESRCH);
1990 	/*
1991 	 * Caller should verify that value being deleted is not the
1992 	 * system value.
1993 	 */
1994 	ASSERT(tmp->rcv_privilege != RCPRIV_SYSTEM);
1995 
1996 	/*
1997 	 * rctl_local_insert_cb() does the job of flagging an error
1998 	 * for any duplicate values. So, call rctl_local_insert_cb()
1999 	 * for the new value first, then do deletion of the old value.
2000 	 * Since this is a callback function to rctl_local_op, we can
2001 	 * count on rcs_lock being held at this point. This guarantees
2002 	 * that there is at no point a visible list which contains both
2003 	 * new and old values.
2004 	 */
2005 	if (ret = rctl_local_insert_cb(hndl, p, e, rctl, NULL, nval))
2006 		return (ret);
2007 
2008 	ret = rctl_local_delete_cb(hndl, p, e, rctl, NULL, oval);
2009 	ASSERT(ret == 0);
2010 	return (0);
2011 }
2012 
2013 /*
2014  * int rctl_local_replace(rctl_hndl_t, void *, int, uint64_t *)
2015  *
2016  * Overview
2017  *   Replace the rctl value with a new one.
2018  *
2019  * Return values
2020  *   0 for successful replace, errno otherwise.
2021  */
2022 int
2023 rctl_local_replace(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
2024     struct proc *p)
2025 {
2026 	return (rctl_local_op(hndl, oval, nval, rctl_local_replace_cb, p));
2027 }
2028 
2029 /*
2030  * int rctl_rlimit_get(rctl_hndl_t, struct proc *, struct rlimit64 *)
2031  *
2032  * Overview
2033  *   To support rlimit compatibility, we need a function which takes a 64-bit
2034  *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
2035  *   This operation is only intended for legacy rlimits.
2036  */
2037 int
2038 rctl_rlimit_get(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64)
2039 {
2040 	rctl_t *rctl;
2041 	rctl_val_t *rval;
2042 	rctl_set_t *rset = p->p_rctls;
2043 	int soft_limit_seen = 0;
2044 	int test_for_deny = 1;
2045 
2046 	mutex_enter(&rset->rcs_lock);
2047 	if (rctl_set_find(rset, rc, &rctl) == -1) {
2048 		mutex_exit(&rset->rcs_lock);
2049 		return (-1);
2050 	}
2051 
2052 	rval = rctl->rc_values;
2053 
2054 	if (rctl->rc_dict_entry->rcd_flagaction & (RCTL_GLOBAL_DENY_NEVER |
2055 	    RCTL_GLOBAL_DENY_ALWAYS))
2056 		test_for_deny = 0;
2057 
2058 	/*
2059 	 * 1.  Find the first control value with the RCTL_LOCAL_DENY bit set.
2060 	 */
2061 	while (rval != NULL && rval->rcv_privilege != RCPRIV_SYSTEM) {
2062 		if (test_for_deny &&
2063 		    (rval->rcv_flagaction & RCTL_LOCAL_DENY) == 0) {
2064 			rval = rval->rcv_next;
2065 			continue;
2066 		}
2067 
2068 		/*
2069 		 * 2.  If this is an RCPRIV_BASIC value, then we've found the
2070 		 * effective soft limit and should set rlim_cur.  We should then
2071 		 * continue looking for another control value with the DENY bit
2072 		 * set.
2073 		 */
2074 		if (rval->rcv_privilege == RCPRIV_BASIC) {
2075 			if (soft_limit_seen) {
2076 				rval = rval->rcv_next;
2077 				continue;
2078 			}
2079 
2080 			if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2081 			    rval->rcv_value < rctl_model_maximum(
2082 			    rctl->rc_dict_entry, p))
2083 				rlp64->rlim_cur = rval->rcv_value;
2084 			else
2085 				rlp64->rlim_cur = RLIM64_INFINITY;
2086 			soft_limit_seen = 1;
2087 
2088 			rval = rval->rcv_next;
2089 			continue;
2090 		}
2091 
2092 		/*
2093 		 * 3.  This is an RCPRIV_PRIVILEGED value.  If we haven't found
2094 		 * a soft limit candidate, then we've found the effective hard
2095 		 * and soft limits and should set both  If we had found a soft
2096 		 * limit, then this is only the hard limit and we need only set
2097 		 * rlim_max.
2098 		 */
2099 		if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2100 		    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry,
2101 		    p))
2102 			rlp64->rlim_max = rval->rcv_value;
2103 		else
2104 			rlp64->rlim_max = RLIM64_INFINITY;
2105 		if (!soft_limit_seen)
2106 			rlp64->rlim_cur = rlp64->rlim_max;
2107 
2108 		mutex_exit(&rset->rcs_lock);
2109 		return (0);
2110 	}
2111 
2112 	if (rval == NULL) {
2113 		/*
2114 		 * This control sequence is corrupt, as it is not terminated by
2115 		 * a system privileged control value.
2116 		 */
2117 		mutex_exit(&rset->rcs_lock);
2118 		return (-1);
2119 	}
2120 
2121 	/*
2122 	 * 4.  If we run into a RCPRIV_SYSTEM value, then the hard limit (and
2123 	 * the soft, if we haven't a soft candidate) should be the value of the
2124 	 * system control value.
2125 	 */
2126 	if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2127 	    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry, p))
2128 		rlp64->rlim_max = rval->rcv_value;
2129 	else
2130 		rlp64->rlim_max = RLIM64_INFINITY;
2131 
2132 	if (!soft_limit_seen)
2133 		rlp64->rlim_cur = rlp64->rlim_max;
2134 
2135 	mutex_exit(&rset->rcs_lock);
2136 	return (0);
2137 }
2138 
2139 /*
2140  * rctl_alloc_gp_t *rctl_rlimit_set_prealloc(uint_t)
2141  *
2142  * Overview
2143  *   Before making a series of calls to rctl_rlimit_set(), we must have a
2144  *   preallocated batch of resource control values, as rctl_rlimit_set() can
2145  *   potentially consume two resource control values per call.
2146  *
2147  * Return values
2148  *   A populated resource control allocation group with 2n resource control
2149  *   values.
2150  *
2151  * Caller's context
2152  *   Must be safe for KM_SLEEP allocations.
2153  */
2154 rctl_alloc_gp_t *
2155 rctl_rlimit_set_prealloc(uint_t n)
2156 {
2157 	rctl_alloc_gp_t *gp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
2158 
2159 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
2160 
2161 	gp->rcag_nvals = 2 * n;
2162 
2163 	rctl_gp_alloc(gp);
2164 
2165 	return (gp);
2166 }
2167 
2168 /*
2169  * int rctl_rlimit_set(rctl_hndl_t, struct proc *, struct rlimit64 *, int,
2170  *   int)
2171  *
2172  * Overview
2173  *   To support rlimit compatibility, we need a function which takes a 64-bit
2174  *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
2175  *   This operation is only intended for legacy rlimits.
2176  *
2177  *   The implementation of rctl_rlimit_set() is a bit clever, as it tries to
2178  *   minimize the number of values placed on the value sequence in various
2179  *   cases.  Furthermore, we don't allow multiple identical privilege-action
2180  *   values on the same sequence.  (That is, we don't want a sequence like
2181  *   "while (1) { rlim.rlim_cur++; setrlimit(..., rlim); }" to exhaust kernel
2182  *   memory.)  So we want to delete any values with the same privilege value and
2183  *   action.
2184  *
2185  * Return values
2186  *   0 for successful set, errno otherwise. Errno will be either EINVAL
2187  *   or EPERM, in keeping with defined errnos for ulimit() and setrlimit()
2188  *   system calls.
2189  */
2190 /*ARGSUSED*/
2191 int
2192 rctl_rlimit_set(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64,
2193     rctl_alloc_gp_t *ragp, int flagaction, int signal, const cred_t *cr)
2194 {
2195 	rctl_t *rctl;
2196 	rctl_val_t *rval, *rval_priv, *rval_basic;
2197 	rctl_set_t *rset = p->p_rctls;
2198 	rctl_qty_t max;
2199 	rctl_entity_p_t e;
2200 	struct rlimit64 cur_rl;
2201 
2202 	e.rcep_t = RCENTITY_PROCESS;
2203 	e.rcep_p.proc = p;
2204 
2205 	if (rlp64->rlim_cur > rlp64->rlim_max)
2206 		return (EINVAL);
2207 
2208 	if (rctl_rlimit_get(rc, p, &cur_rl) == -1)
2209 		return (EINVAL);
2210 
2211 	/*
2212 	 * If we are not privileged, we can only lower the hard limit.
2213 	 */
2214 	if ((rlp64->rlim_max > cur_rl.rlim_max) &&
2215 	    cur_rl.rlim_max != RLIM64_INFINITY &&
2216 	    secpolicy_resource(cr) != 0)
2217 		return (EPERM);
2218 
2219 	mutex_enter(&rset->rcs_lock);
2220 
2221 	if (rctl_set_find(rset, rc, &rctl) == -1) {
2222 		mutex_exit(&rset->rcs_lock);
2223 		return (EINVAL);
2224 	}
2225 
2226 	rval_priv = rctl_gp_detach_val(ragp);
2227 
2228 	rval = rctl->rc_values;
2229 
2230 	while (rval != NULL) {
2231 		rctl_val_t *next = rval->rcv_next;
2232 
2233 		if (rval->rcv_privilege == RCPRIV_SYSTEM)
2234 			break;
2235 
2236 		if ((rval->rcv_privilege == RCPRIV_BASIC) ||
2237 		    (rval->rcv_flagaction & ~RCTL_LOCAL_ACTION_MASK) ==
2238 		    (flagaction & ~RCTL_LOCAL_ACTION_MASK)) {
2239 			if (rctl->rc_cursor == rval) {
2240 				rctl->rc_cursor = rval->rcv_next;
2241 				rctl_val_list_reset(rctl->rc_cursor);
2242 				RCTLOP_SET(rctl, p, &e, rctl_model_value(
2243 				    rctl->rc_dict_entry, p,
2244 				    rctl->rc_cursor->rcv_value));
2245 			}
2246 			(void) rctl_val_list_delete(&rctl->rc_values, rval);
2247 		}
2248 
2249 		rval = next;
2250 	}
2251 
2252 	rval_priv->rcv_privilege = RCPRIV_PRIVILEGED;
2253 	rval_priv->rcv_flagaction = flagaction;
2254 	if (rlp64->rlim_max == RLIM64_INFINITY) {
2255 		rval_priv->rcv_flagaction |= RCTL_LOCAL_MAXIMAL;
2256 		max = rctl->rc_dict_entry->rcd_max_native;
2257 	} else {
2258 		max = rlp64->rlim_max;
2259 	}
2260 	rval_priv->rcv_value = max;
2261 	rval_priv->rcv_action_signal = signal;
2262 	rval_priv->rcv_action_recipient = NULL;
2263 	rval_priv->rcv_action_recip_pid = -1;
2264 	rval_priv->rcv_firing_time = 0;
2265 	rval_priv->rcv_prev = rval_priv->rcv_next = NULL;
2266 
2267 	(void) rctl_val_list_insert(&rctl->rc_values, rval_priv);
2268 	rctl->rc_cursor = rval_priv;
2269 	rctl_val_list_reset(rctl->rc_cursor);
2270 	RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
2271 	    rctl->rc_cursor->rcv_value));
2272 
2273 	if (rlp64->rlim_cur != RLIM64_INFINITY && rlp64->rlim_cur < max) {
2274 		rval_basic = rctl_gp_detach_val(ragp);
2275 
2276 		rval_basic->rcv_privilege = RCPRIV_BASIC;
2277 		rval_basic->rcv_value = rlp64->rlim_cur;
2278 		rval_basic->rcv_flagaction = flagaction;
2279 		rval_basic->rcv_action_signal = signal;
2280 		rval_basic->rcv_action_recipient = p;
2281 		rval_basic->rcv_action_recip_pid = p->p_pid;
2282 		rval_basic->rcv_firing_time = 0;
2283 		rval_basic->rcv_prev = rval_basic->rcv_next = NULL;
2284 
2285 		(void) rctl_val_list_insert(&rctl->rc_values, rval_basic);
2286 		rctl->rc_cursor = rval_basic;
2287 		rctl_val_list_reset(rctl->rc_cursor);
2288 		RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
2289 		    rctl->rc_cursor->rcv_value));
2290 	}
2291 
2292 	ASSERT(rctl->rc_cursor != NULL);
2293 
2294 	mutex_exit(&rset->rcs_lock);
2295 	return (0);
2296 }
2297 
2298 
2299 /*
2300  * rctl_hndl_t rctl_register(const char *, rctl_entity_t, int, rlim64_t,
2301  *   rlim64_t, rctl_ops_t *)
2302  *
2303  * Overview
2304  *   rctl_register() performs a look-up in the dictionary of rctls
2305  *   active on the system; if a rctl of that name is absent, an entry is
2306  *   made into the dictionary.  The rctl is returned with its reference
2307  *   count incremented by one.  If the rctl name already exists, we panic.
2308  *   (Were the resource control system to support dynamic loading and unloading,
2309  *   which it is structured for, duplicate registration should lead to load
2310  *   failure instead of panicking.)
2311  *
2312  *   Each registered rctl has a requirement that a RCPRIV_SYSTEM limit be
2313  *   defined.  This limit contains the highest possible value for this quantity
2314  *   on the system.  Furthermore, the registered control must provide infinite
2315  *   values for all applicable address space models supported by the operating
2316  *   system.  Attempts to set resource control values beyond the system limit
2317  *   will fail.
2318  *
2319  * Return values
2320  *   The rctl's ID.
2321  *
2322  * Caller's context
2323  *   Caller must be in a context suitable for KM_SLEEP allocations.
2324  */
2325 rctl_hndl_t
2326 rctl_register(
2327     const char *name,
2328     rctl_entity_t entity,
2329     int global_flags,
2330     rlim64_t max_native,
2331     rlim64_t max_ilp32,
2332     rctl_ops_t *ops)
2333 {
2334 	rctl_t *rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
2335 	rctl_val_t *rctl_val = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2336 	rctl_dict_entry_t *rctl_de = kmem_zalloc(sizeof (rctl_dict_entry_t),
2337 	    KM_SLEEP);
2338 	rctl_t *old_rctl;
2339 	rctl_hndl_t rhndl;
2340 	int localflags;
2341 
2342 	ASSERT(ops != NULL);
2343 
2344 	bzero(rctl, sizeof (rctl_t));
2345 	bzero(rctl_val, sizeof (rctl_val_t));
2346 
2347 	if (global_flags & RCTL_GLOBAL_DENY_NEVER)
2348 		localflags = RCTL_LOCAL_MAXIMAL;
2349 	else
2350 		localflags = RCTL_LOCAL_MAXIMAL | RCTL_LOCAL_DENY;
2351 
2352 	rctl_val->rcv_privilege = RCPRIV_SYSTEM;
2353 	rctl_val->rcv_value = max_native;
2354 	rctl_val->rcv_flagaction = localflags;
2355 	rctl_val->rcv_action_signal = 0;
2356 	rctl_val->rcv_action_recipient = NULL;
2357 	rctl_val->rcv_action_recip_pid = -1;
2358 	rctl_val->rcv_firing_time = 0;
2359 	rctl_val->rcv_next = NULL;
2360 	rctl_val->rcv_prev = NULL;
2361 
2362 	rctl_de->rcd_name = (char *)name;
2363 	rctl_de->rcd_default_value = rctl_val;
2364 	rctl_de->rcd_max_native = max_native;
2365 	rctl_de->rcd_max_ilp32 = max_ilp32;
2366 	rctl_de->rcd_entity = entity;
2367 	rctl_de->rcd_ops = ops;
2368 	rctl_de->rcd_flagaction = global_flags;
2369 
2370 	rctl->rc_dict_entry = rctl_de;
2371 	rctl->rc_values = rctl_val;
2372 
2373 	/*
2374 	 * 1.  Take global lock, validate nonexistence of name, get ID.
2375 	 */
2376 	mutex_enter(&rctl_dict_lock);
2377 
2378 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
2379 	    (mod_hash_val_t *)&rhndl) != MH_ERR_NOTFOUND)
2380 		panic("duplicate registration of rctl %s", name);
2381 
2382 	rhndl = rctl_de->rcd_id = rctl->rc_id =
2383 	    (rctl_hndl_t)id_alloc(rctl_ids);
2384 
2385 	/*
2386 	 * 2.  Insert name-entry pair in rctl_dict_by_name.
2387 	 */
2388 	if (mod_hash_insert(rctl_dict_by_name, (mod_hash_key_t)name,
2389 	    (mod_hash_val_t)rctl_de))
2390 		panic("unable to insert rctl dict entry for %s (%u)", name,
2391 		    (uint_t)rctl->rc_id);
2392 
2393 	/*
2394 	 * 3.  Insert ID-rctl_t * pair in rctl_dict.
2395 	 */
2396 	if (mod_hash_find(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2397 	    (mod_hash_val_t *)&old_rctl) != MH_ERR_NOTFOUND)
2398 		panic("duplicate rctl ID %u registered", rctl->rc_id);
2399 
2400 	if (mod_hash_insert(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2401 	    (mod_hash_val_t)rctl))
2402 		panic("unable to insert rctl %s/%u (%p)", name,
2403 		    (uint_t)rctl->rc_id, (void *)rctl);
2404 
2405 	/*
2406 	 * 3a. Insert rctl_dict_entry_t * in appropriate entity list.
2407 	 */
2408 
2409 	mutex_enter(&rctl_lists_lock);
2410 
2411 	switch (entity) {
2412 	case RCENTITY_ZONE:
2413 	case RCENTITY_PROJECT:
2414 	case RCENTITY_TASK:
2415 	case RCENTITY_PROCESS:
2416 		rctl_de->rcd_next = rctl_lists[entity];
2417 		rctl_lists[entity] = rctl_de;
2418 		break;
2419 	default:
2420 		panic("registering unknown rctl entity %d (%s)", entity,
2421 		    name);
2422 		break;
2423 	}
2424 
2425 	mutex_exit(&rctl_lists_lock);
2426 
2427 	/*
2428 	 * 4.  Drop lock.
2429 	 */
2430 	mutex_exit(&rctl_dict_lock);
2431 
2432 	return (rhndl);
2433 }
2434 
2435 /*
2436  * static int rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p,
2437  *    rctl_val_t *v)
2438  *
2439  * Overview
2440  *   rctl_global_action() takes, in according with the flags on the rctl_dict
2441  *   entry for the given control, the appropriate actions on the exceeded
2442  *   control value.  Additionally, rctl_global_action() updates the firing time
2443  *   on the exceeded value.
2444  *
2445  * Return values
2446  *   A bitmask reflecting the actions actually taken.
2447  *
2448  * Caller's context
2449  *   No restrictions on context.
2450  */
2451 /*ARGSUSED*/
2452 static int
2453 rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v)
2454 {
2455 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2456 	const char *pr, *en, *idstr;
2457 	id_t id;
2458 	enum {
2459 		SUFFIX_NONE,	/* id consumed directly */
2460 		SUFFIX_NUMERIC,	/* id consumed in suffix */
2461 		SUFFIX_STRING	/* idstr consumed in suffix */
2462 	} suffix = SUFFIX_NONE;
2463 	int ret = 0;
2464 
2465 	v->rcv_firing_time = gethrtime();
2466 
2467 	switch (v->rcv_privilege) {
2468 	case RCPRIV_BASIC:
2469 		pr = "basic";
2470 		break;
2471 	case RCPRIV_PRIVILEGED:
2472 		pr = "privileged";
2473 		break;
2474 	case RCPRIV_SYSTEM:
2475 		pr = "system";
2476 		break;
2477 	default:
2478 		pr = "unknown";
2479 		break;
2480 	}
2481 
2482 	switch (rde->rcd_entity) {
2483 	case RCENTITY_PROCESS:
2484 		en = "process";
2485 		id = p->p_pid;
2486 		suffix = SUFFIX_NONE;
2487 		break;
2488 	case RCENTITY_TASK:
2489 		en = "task";
2490 		id = p->p_task->tk_tkid;
2491 		suffix = SUFFIX_NUMERIC;
2492 		break;
2493 	case RCENTITY_PROJECT:
2494 		en = "project";
2495 		id = p->p_task->tk_proj->kpj_id;
2496 		suffix = SUFFIX_NUMERIC;
2497 		break;
2498 	case RCENTITY_ZONE:
2499 		en = "zone";
2500 		idstr = p->p_zone->zone_name;
2501 		suffix = SUFFIX_STRING;
2502 		break;
2503 	default:
2504 		en = "unknown entity associated with process";
2505 		id = p->p_pid;
2506 		suffix = SUFFIX_NONE;
2507 		break;
2508 	}
2509 
2510 	if (rde->rcd_flagaction & RCTL_GLOBAL_SYSLOG) {
2511 		switch (suffix) {
2512 		default:
2513 		case SUFFIX_NONE:
2514 			(void) strlog(0, 0, 0,
2515 			    rde->rcd_strlog_flags | log_global.lz_active,
2516 			    "%s rctl %s (value %llu) exceeded by %s %d.",
2517 			    pr, rde->rcd_name, v->rcv_value, en, id);
2518 			break;
2519 		case SUFFIX_NUMERIC:
2520 			(void) strlog(0, 0, 0,
2521 			    rde->rcd_strlog_flags | log_global.lz_active,
2522 			    "%s rctl %s (value %llu) exceeded by process %d"
2523 			    " in %s %d.",
2524 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2525 			    en, id);
2526 			break;
2527 		case SUFFIX_STRING:
2528 			(void) strlog(0, 0, 0,
2529 			    rde->rcd_strlog_flags | log_global.lz_active,
2530 			    "%s rctl %s (value %llu) exceeded by process %d"
2531 			    " in %s %s.",
2532 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2533 			    en, idstr);
2534 			break;
2535 		}
2536 	}
2537 
2538 	if (rde->rcd_flagaction & RCTL_GLOBAL_DENY_ALWAYS)
2539 		ret |= RCT_DENY;
2540 
2541 	return (ret);
2542 }
2543 
2544 static int
2545 rctl_local_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v,
2546     uint_t safety)
2547 {
2548 	int ret = 0;
2549 	sigqueue_t *sqp = NULL;
2550 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2551 	int unobservable = (rde->rcd_flagaction & RCTL_GLOBAL_UNOBSERVABLE);
2552 
2553 	proc_t *recipient = v->rcv_action_recipient;
2554 	id_t recip_pid = v->rcv_action_recip_pid;
2555 	int recip_signal = v->rcv_action_signal;
2556 	uint_t flagaction = v->rcv_flagaction;
2557 
2558 	if (safety == RCA_UNSAFE_ALL) {
2559 		if (flagaction & RCTL_LOCAL_DENY) {
2560 			ret |= RCT_DENY;
2561 		}
2562 		return (ret);
2563 	}
2564 
2565 	if (flagaction & RCTL_LOCAL_SIGNAL) {
2566 		/*
2567 		 * We can build a siginfo only in the case that it is
2568 		 * safe for us to drop p_lock.  (For asynchronous
2569 		 * checks this is currently not true.)
2570 		 */
2571 		if (safety == RCA_SAFE) {
2572 			mutex_exit(&rset->rcs_lock);
2573 			mutex_exit(&p->p_lock);
2574 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
2575 			mutex_enter(&p->p_lock);
2576 			mutex_enter(&rset->rcs_lock);
2577 
2578 			sqp->sq_info.si_signo = recip_signal;
2579 			sqp->sq_info.si_code = SI_RCTL;
2580 			sqp->sq_info.si_errno = 0;
2581 			sqp->sq_info.si_entity = (int)rde->rcd_entity;
2582 		}
2583 
2584 		if (recipient == NULL || recipient == p) {
2585 			ret |= RCT_SIGNAL;
2586 
2587 			if (sqp == NULL) {
2588 				sigtoproc(p, NULL, recip_signal);
2589 			} else if (p == curproc) {
2590 				/*
2591 				 * Then this is a synchronous test and we can
2592 				 * direct the signal at the violating thread.
2593 				 */
2594 				sigaddqa(curproc, curthread, sqp);
2595 			} else {
2596 				sigaddqa(p, NULL, sqp);
2597 			}
2598 		} else if (!unobservable) {
2599 			proc_t *rp;
2600 
2601 			mutex_exit(&rset->rcs_lock);
2602 			mutex_exit(&p->p_lock);
2603 
2604 			mutex_enter(&pidlock);
2605 			if ((rp = prfind(recip_pid)) == recipient) {
2606 				/*
2607 				 * Recipient process is still alive, but may not
2608 				 * be in this task or project any longer.  In
2609 				 * this case, the recipient's resource control
2610 				 * set pertinent to this control will have
2611 				 * changed--and we will not deliver the signal,
2612 				 * as the recipient process is trying to tear
2613 				 * itself off of its former set.
2614 				 */
2615 				mutex_enter(&rp->p_lock);
2616 				mutex_exit(&pidlock);
2617 
2618 				if (rctl_entity_obtain_rset(rde, rp) == rset) {
2619 					ret |= RCT_SIGNAL;
2620 
2621 					if (sqp == NULL)
2622 						sigtoproc(rp, NULL,
2623 						    recip_signal);
2624 					else
2625 						sigaddqa(rp, NULL, sqp);
2626 				} else if (sqp) {
2627 					kmem_free(sqp, sizeof (sigqueue_t));
2628 				}
2629 				mutex_exit(&rp->p_lock);
2630 			} else {
2631 				mutex_exit(&pidlock);
2632 				if (sqp)
2633 					kmem_free(sqp, sizeof (sigqueue_t));
2634 			}
2635 
2636 			mutex_enter(&p->p_lock);
2637 			/*
2638 			 * Since we dropped p_lock, we may no longer be in the
2639 			 * same task or project as we were at entry.  It is thus
2640 			 * unsafe for us to reacquire the set lock at this
2641 			 * point; callers of rctl_local_action() must handle
2642 			 * this possibility.
2643 			 */
2644 			ret |= RCT_LK_ABANDONED;
2645 		} else if (sqp) {
2646 			kmem_free(sqp, sizeof (sigqueue_t));
2647 		}
2648 	}
2649 
2650 	if ((flagaction & RCTL_LOCAL_DENY) &&
2651 	    (recipient == NULL || recipient == p)) {
2652 		ret |= RCT_DENY;
2653 	}
2654 
2655 	return (ret);
2656 }
2657 
2658 /*
2659  * int rctl_action(rctl_hndl_t, rctl_set_t *, struct proc *, uint_t)
2660  *
2661  * Overview
2662  *   Take the action associated with the enforced value (as defined by
2663  *   rctl_get_enforced_value()) being exceeded or encountered.  Possibly perform
2664  *   a restricted subset of the available actions, if circumstances dictate that
2665  *   we cannot safely allocate memory (for a sigqueue_t) or guarantee process
2666  *   persistence across the duration of the function (an asynchronous action).
2667  *
2668  * Return values
2669  *   Actions taken, according to the rctl_test bitmask.
2670  *
2671  * Caller's context
2672  *   Safe to acquire rcs_lock.
2673  */
2674 int
2675 rctl_action(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p, uint_t safety)
2676 {
2677 	return (rctl_action_entity(hndl, rset, p, NULL, safety));
2678 }
2679 
2680 int
2681 rctl_action_entity(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p,
2682     rctl_entity_p_t *e, uint_t safety)
2683 {
2684 	int ret = RCT_NONE;
2685 	rctl_t *lrctl;
2686 	rctl_entity_p_t e_tmp;
2687 
2688 rctl_action_acquire:
2689 	mutex_enter(&rset->rcs_lock);
2690 	if (rctl_set_find(rset, hndl, &lrctl) == -1) {
2691 		mutex_exit(&rset->rcs_lock);
2692 		return (ret);
2693 	}
2694 
2695 	if (e == NULL) {
2696 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2697 		    p, &e_tmp);
2698 		e = &e_tmp;
2699 	}
2700 
2701 	if ((ret & RCT_LK_ABANDONED) == 0) {
2702 		ret |= rctl_global_action(lrctl, rset, p, lrctl->rc_cursor);
2703 
2704 		RCTLOP_ACTION(lrctl, p, e);
2705 
2706 		ret |= rctl_local_action(lrctl, rset, p,
2707 		    lrctl->rc_cursor, safety);
2708 
2709 		if (ret & RCT_LK_ABANDONED)
2710 			goto rctl_action_acquire;
2711 	}
2712 
2713 	ret &= ~RCT_LK_ABANDONED;
2714 
2715 	if (!(ret & RCT_DENY) &&
2716 	    lrctl->rc_cursor->rcv_next != NULL) {
2717 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2718 
2719 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2720 		    p, lrctl->rc_cursor->rcv_value));
2721 
2722 	}
2723 	mutex_exit(&rset->rcs_lock);
2724 
2725 	return (ret);
2726 }
2727 
2728 /*
2729  * int rctl_test(rctl_hndl_t, rctl_set_t *, struct proc *, rctl_qty_t, uint_t)
2730  *
2731  * Overview
2732  *   Increment the resource associated with the given handle, returning zero if
2733  *   the incremented value does not exceed the threshold for the current limit
2734  *   on the resource.
2735  *
2736  * Return values
2737  *   Actions taken, according to the rctl_test bitmask.
2738  *
2739  * Caller's context
2740  *   p_lock held by caller.
2741  */
2742 /*ARGSUSED*/
2743 int
2744 rctl_test(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2745     rctl_qty_t incr, uint_t flags)
2746 {
2747 	return (rctl_test_entity(rhndl, rset, p, NULL, incr, flags));
2748 }
2749 
2750 int
2751 rctl_test_entity(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2752     rctl_entity_p_t *e, rctl_qty_t incr, uint_t flags)
2753 {
2754 	rctl_t *lrctl;
2755 	int ret = RCT_NONE;
2756 	rctl_entity_p_t e_tmp;
2757 	if (p == &p0) {
2758 		/*
2759 		 * We don't enforce rctls on the kernel itself.
2760 		 */
2761 		return (ret);
2762 	}
2763 
2764 rctl_test_acquire:
2765 	ASSERT(MUTEX_HELD(&p->p_lock));
2766 
2767 	mutex_enter(&rset->rcs_lock);
2768 
2769 	/*
2770 	 * Dereference from rctl_set.  We don't enforce newly loaded controls
2771 	 * that haven't been set on this entity (since the only valid value is
2772 	 * the infinite system value).
2773 	 */
2774 	if (rctl_set_find(rset, rhndl, &lrctl) == -1) {
2775 		mutex_exit(&rset->rcs_lock);
2776 		return (ret);
2777 	}
2778 
2779 	/*
2780 	 * This control is currently unenforced:  maximal value on control
2781 	 * supporting infinitely available resource.
2782 	 */
2783 	if ((lrctl->rc_dict_entry->rcd_flagaction & RCTL_GLOBAL_INFINITE) &&
2784 	    (lrctl->rc_cursor->rcv_flagaction & RCTL_LOCAL_MAXIMAL)) {
2785 
2786 		mutex_exit(&rset->rcs_lock);
2787 		return (ret);
2788 	}
2789 
2790 	/*
2791 	 * If we have been called by rctl_test, look up the entity pointer
2792 	 * from the proc pointer.
2793 	 */
2794 	if (e == NULL) {
2795 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2796 		    p, &e_tmp);
2797 		e = &e_tmp;
2798 	}
2799 
2800 	/*
2801 	 * Get enforced rctl value and current usage.  Test the increment
2802 	 * with the current usage against the enforced value--take action as
2803 	 * necessary.
2804 	 */
2805 	while (RCTLOP_TEST(lrctl, p, e, lrctl->rc_cursor, incr, flags)) {
2806 		if ((ret & RCT_LK_ABANDONED) == 0) {
2807 			ret |= rctl_global_action(lrctl, rset, p,
2808 			    lrctl->rc_cursor);
2809 
2810 			RCTLOP_ACTION(lrctl, p, e);
2811 
2812 			ret |= rctl_local_action(lrctl, rset, p,
2813 			    lrctl->rc_cursor, flags);
2814 
2815 			if (ret & RCT_LK_ABANDONED)
2816 				goto rctl_test_acquire;
2817 		}
2818 
2819 		ret &= ~RCT_LK_ABANDONED;
2820 
2821 		if ((ret & RCT_DENY) == RCT_DENY ||
2822 		    lrctl->rc_cursor->rcv_next == NULL) {
2823 			ret |= RCT_DENY;
2824 			break;
2825 		}
2826 
2827 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2828 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2829 		    p, lrctl->rc_cursor->rcv_value));
2830 	}
2831 
2832 	mutex_exit(&rset->rcs_lock);
2833 
2834 	return (ret);
2835 }
2836 
2837 /*
2838  * void rctl_init(void)
2839  *
2840  * Overview
2841  *   Initialize the rctl subsystem, including the primoridal rctls
2842  *   provided by the system.  New subsystem-specific rctls should _not_ be
2843  *   initialized here.  (Do it in your own file.)
2844  *
2845  * Return values
2846  *   None.
2847  *
2848  * Caller's context
2849  *   Safe for KM_SLEEP allocations.  Must be called prior to any process model
2850  *   initialization.
2851  */
2852 void
2853 rctl_init(void)
2854 {
2855 	rctl_cache = kmem_cache_create("rctl_cache", sizeof (rctl_t),
2856 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2857 	rctl_val_cache = kmem_cache_create("rctl_val_cache",
2858 	    sizeof (rctl_val_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
2859 
2860 	rctl_dict = mod_hash_create_extended("rctl_dict",
2861 	    rctl_dict_size, mod_hash_null_keydtor, rctl_dict_val_dtor,
2862 	    rctl_dict_hash_by_id, NULL, rctl_dict_id_cmp, KM_SLEEP);
2863 	rctl_dict_by_name = mod_hash_create_strhash(
2864 	    "rctl_handles_by_name", rctl_dict_size,
2865 	    mod_hash_null_valdtor);
2866 	rctl_ids = id_space_create("rctl_ids", 1, max_rctl_hndl);
2867 	bzero(rctl_lists, (RC_MAX_ENTITY + 1) * sizeof (rctl_dict_entry_t *));
2868 
2869 	rctlproc_init();
2870 }
2871 
2872 /*
2873  * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2874  *     int chargeproc)
2875  *
2876  * Increments the amount of locked memory on a project, and
2877  * zone. If proj is non-NULL the project must be held by the
2878  * caller; if it is NULL the proj and zone of proc_t p are used.
2879  * If chargeproc is non-zero, then the charged amount is cached
2880  * on p->p_locked_mem so that the charge can be migrated when a
2881  * process changes projects.
2882  *
2883  * Return values
2884  *    0 - success
2885  *    EAGAIN - attempting to increment locked memory is denied by one
2886  *      or more resource entities.
2887  */
2888 int
2889 rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2890     int chargeproc)
2891 {
2892 	kproject_t *projp;
2893 	zone_t *zonep;
2894 	rctl_entity_p_t e;
2895 	int ret = 0;
2896 
2897 	ASSERT(p != NULL);
2898 	ASSERT(MUTEX_HELD(&p->p_lock));
2899 	if (proj != NULL) {
2900 		projp = proj;
2901 		zonep = proj->kpj_zone;
2902 	} else {
2903 		projp = p->p_task->tk_proj;
2904 		zonep = p->p_zone;
2905 	}
2906 
2907 	mutex_enter(&zonep->zone_mem_lock);
2908 
2909 	e.rcep_p.proj = projp;
2910 	e.rcep_t = RCENTITY_PROJECT;
2911 
2912 	/* check for overflow */
2913 	if ((projp->kpj_data.kpd_locked_mem + inc) <
2914 	    projp->kpj_data.kpd_locked_mem) {
2915 		ret = EAGAIN;
2916 		goto out;
2917 	}
2918 	if (projp->kpj_data.kpd_locked_mem + inc >
2919 	    projp->kpj_data.kpd_locked_mem_ctl) {
2920 		if (rctl_test_entity(rc_project_locked_mem, projp->kpj_rctls,
2921 		    p, &e, inc, 0) & RCT_DENY) {
2922 			ret = EAGAIN;
2923 			goto out;
2924 		}
2925 	}
2926 	e.rcep_p.zone = zonep;
2927 	e.rcep_t = RCENTITY_ZONE;
2928 
2929 	/* Check for overflow */
2930 	if ((zonep->zone_locked_mem + inc) < zonep->zone_locked_mem) {
2931 		ret = EAGAIN;
2932 		goto out;
2933 	}
2934 	if (zonep->zone_locked_mem + inc > zonep->zone_locked_mem_ctl) {
2935 		if (rctl_test_entity(rc_zone_locked_mem, zonep->zone_rctls,
2936 		    p, &e, inc, 0) & RCT_DENY) {
2937 			ret = EAGAIN;
2938 			goto out;
2939 		}
2940 	}
2941 
2942 	zonep->zone_locked_mem += inc;
2943 	projp->kpj_data.kpd_locked_mem += inc;
2944 	if (chargeproc != 0) {
2945 		p->p_locked_mem += inc;
2946 	}
2947 out:
2948 	mutex_exit(&zonep->zone_mem_lock);
2949 	return (ret);
2950 }
2951 
2952 /*
2953  * rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2954  *     int creditproc)
2955  *
2956  * Decrements the amount of locked memory on a project and
2957  * zone.  If proj is non-NULL the project must be held by the
2958  * caller; if it is NULL the proj and zone of proc_t p are used.
2959  * If creditproc is non-zero, then the quantity of locked memory
2960  * is subtracted from p->p_locked_mem.
2961  *
2962  * Return values
2963  *   none
2964  */
2965 void
2966 rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2967     int creditproc)
2968 {
2969 	kproject_t *projp;
2970 	zone_t *zonep;
2971 
2972 	if (proj != NULL) {
2973 		projp = proj;
2974 		zonep = proj->kpj_zone;
2975 	} else {
2976 		ASSERT(p != NULL);
2977 		ASSERT(MUTEX_HELD(&p->p_lock));
2978 		projp = p->p_task->tk_proj;
2979 		zonep = p->p_zone;
2980 	}
2981 
2982 	mutex_enter(&zonep->zone_mem_lock);
2983 	zonep->zone_locked_mem -= inc;
2984 	projp->kpj_data.kpd_locked_mem -= inc;
2985 	if (creditproc != 0) {
2986 		ASSERT(p != NULL);
2987 		ASSERT(MUTEX_HELD(&p->p_lock));
2988 		p->p_locked_mem -= inc;
2989 	}
2990 	mutex_exit(&zonep->zone_mem_lock);
2991 }
2992 
2993 /*
2994  * rctl_incr_swap(proc_t *, zone_t *, size_t)
2995  *
2996  * Overview
2997  *   Increments the swap charge on the specified zone.
2998  *
2999  * Return values
3000  *   0 on success.  EAGAIN if swap increment fails due an rctl value
3001  *   on the zone.
3002  *
3003  * Callers context
3004  *   p_lock held on specified proc.
3005  *   swap must be even multiple of PAGESIZE
3006  */
3007 int
3008 rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap)
3009 {
3010 	rctl_entity_p_t e;
3011 
3012 	ASSERT(MUTEX_HELD(&proc->p_lock));
3013 	ASSERT((swap & PAGEOFFSET) == 0);
3014 	e.rcep_p.zone = zone;
3015 	e.rcep_t = RCENTITY_ZONE;
3016 
3017 	mutex_enter(&zone->zone_mem_lock);
3018 
3019 	/* Check for overflow */
3020 	if ((zone->zone_max_swap + swap) < zone->zone_max_swap) {
3021 		mutex_exit(&zone->zone_mem_lock);
3022 		return (EAGAIN);
3023 	}
3024 	if ((zone->zone_max_swap + swap) >
3025 	    zone->zone_max_swap_ctl) {
3026 
3027 		if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls,
3028 		    proc, &e, swap, 0) & RCT_DENY) {
3029 			mutex_exit(&zone->zone_mem_lock);
3030 			return (EAGAIN);
3031 		}
3032 	}
3033 	zone->zone_max_swap += swap;
3034 	mutex_exit(&zone->zone_mem_lock);
3035 	return (0);
3036 }
3037 
3038 /*
3039  * rctl_decr_swap(zone_t *, size_t)
3040  *
3041  * Overview
3042  *   Decrements the swap charge on the specified zone.
3043  *
3044  * Return values
3045  *   None
3046  *
3047  * Callers context
3048  *   swap must be even multiple of PAGESIZE
3049  */
3050 void
3051 rctl_decr_swap(zone_t *zone, size_t swap)
3052 {
3053 	ASSERT((swap & PAGEOFFSET) == 0);
3054 	mutex_enter(&zone->zone_mem_lock);
3055 	ASSERT(zone->zone_max_swap >= swap);
3056 	zone->zone_max_swap -= swap;
3057 	mutex_exit(&zone->zone_mem_lock);
3058 }
3059 
3060 /*
3061  * Create resource kstat
3062  */
3063 static kstat_t *
3064 rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class,
3065     uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid)
3066 {
3067 	kstat_t *ksp = NULL;
3068 	char name[KSTAT_STRLEN];
3069 
3070 	(void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance);
3071 
3072 	if ((ksp = kstat_create_zone("caps", ks_zoneid,
3073 	    name, ks_class, ks_type,
3074 	    ks_ndata, ks_flags, ks_zoneid)) != NULL) {
3075 		if (ks_zoneid != GLOBAL_ZONEID)
3076 			kstat_zone_add(ksp, GLOBAL_ZONEID);
3077 	}
3078 	return (ksp);
3079 }
3080 
3081 /*
3082  * Create zone-specific resource kstat
3083  */
3084 kstat_t *
3085 rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type,
3086     uint_t ks_ndata, uchar_t ks_flags)
3087 {
3088 	char name[KSTAT_STRLEN];
3089 
3090 	(void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name);
3091 
3092 	return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps",
3093 	    ks_type, ks_ndata, ks_flags, zone->zone_id));
3094 }
3095 
3096 /*
3097  * Create project-specific resource kstat
3098  */
3099 kstat_t *
3100 rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type,
3101     uint_t ks_ndata, uchar_t ks_flags)
3102 {
3103 	char name[KSTAT_STRLEN];
3104 
3105 	(void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name);
3106 
3107 	return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps",
3108 	    ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid));
3109 }
3110