xref: /freebsd/sys/kern/kern_rctl.c (revision 3823d5e198425b4f5e5a80267d195769d1063773)
1 /*-
2  * Copyright (c) 2010 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <sys/param.h>
36 #include <sys/bus.h>
37 #include <sys/malloc.h>
38 #include <sys/queue.h>
39 #include <sys/refcount.h>
40 #include <sys/jail.h>
41 #include <sys/kernel.h>
42 #include <sys/limits.h>
43 #include <sys/loginclass.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/racct.h>
47 #include <sys/rctl.h>
48 #include <sys/resourcevar.h>
49 #include <sys/sx.h>
50 #include <sys/sysent.h>
51 #include <sys/sysproto.h>
52 #include <sys/systm.h>
53 #include <sys/types.h>
54 #include <sys/eventhandler.h>
55 #include <sys/lock.h>
56 #include <sys/mutex.h>
57 #include <sys/rwlock.h>
58 #include <sys/sbuf.h>
59 #include <sys/taskqueue.h>
60 #include <sys/tree.h>
61 #include <vm/uma.h>
62 
63 #ifdef RCTL
64 #ifndef RACCT
65 #error "The RCTL option requires the RACCT option"
66 #endif
67 
68 FEATURE(rctl, "Resource Limits");
69 
/*
 * Buffer sizes.
 */
/* Default buffer size for rctl_get_rules(2). */
#define	RCTL_DEFAULT_BUFSIZE	4096
/* NOTE(review): presumably the cap on rule strings copied in -- confirm. */
#define	RCTL_MAX_INBUFLEN	4096
/* Fixed buffer used to format a rule for log and devctl messages. */
#define	RCTL_LOG_BUFSIZE	128

/* Safety margin rctl_pcpu_available() subtracts from the available %cpu. */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)

/*
 * NOTE(review): the HRF_* flags are not referenced in the part of the file
 * reviewed here; their meaning is only suggested by their names.
 */
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2
80 
81 /*
82  * 'rctl_rule_link' connects a rule with every racct it's related to.
83  * For example, rule 'user:X:openfiles:deny=N/process' is linked
84  * with uidinfo for user X, and to each process of that user.
85  */
86 struct rctl_rule_link {
87 	LIST_ENTRY(rctl_rule_link)	rrl_next;
88 	struct rctl_rule		*rrl_rule;
89 	int				rrl_exceeded;
90 };
91 
/*
 * One entry of a name <-> value translation table.  Tables are arrays of
 * these, terminated by an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* textual name */
	int		d_value;	/* numeric constant it stands for */
};
96 
97 static struct dict subjectnames[] = {
98 	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
99 	{ "user", RCTL_SUBJECT_TYPE_USER },
100 	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
101 	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
102 	{ NULL, -1 }};
103 
104 static struct dict resourcenames[] = {
105 	{ "cputime", RACCT_CPU },
106 	{ "datasize", RACCT_DATA },
107 	{ "stacksize", RACCT_STACK },
108 	{ "coredumpsize", RACCT_CORE },
109 	{ "memoryuse", RACCT_RSS },
110 	{ "memorylocked", RACCT_MEMLOCK },
111 	{ "maxproc", RACCT_NPROC },
112 	{ "openfiles", RACCT_NOFILE },
113 	{ "vmemoryuse", RACCT_VMEM },
114 	{ "pseudoterminals", RACCT_NPTS },
115 	{ "swapuse", RACCT_SWAP },
116 	{ "nthr", RACCT_NTHR },
117 	{ "msgqqueued", RACCT_MSGQQUEUED },
118 	{ "msgqsize", RACCT_MSGQSIZE },
119 	{ "nmsgq", RACCT_NMSGQ },
120 	{ "nsem", RACCT_NSEM },
121 	{ "nsemop", RACCT_NSEMOP },
122 	{ "nshm", RACCT_NSHM },
123 	{ "shmsize", RACCT_SHMSIZE },
124 	{ "wallclock", RACCT_WALLCLOCK },
125 	{ "pcpu", RACCT_PCTCPU },
126 	{ NULL, -1 }};
127 
128 static struct dict actionnames[] = {
129 	{ "sighup", RCTL_ACTION_SIGHUP },
130 	{ "sigint", RCTL_ACTION_SIGINT },
131 	{ "sigquit", RCTL_ACTION_SIGQUIT },
132 	{ "sigill", RCTL_ACTION_SIGILL },
133 	{ "sigtrap", RCTL_ACTION_SIGTRAP },
134 	{ "sigabrt", RCTL_ACTION_SIGABRT },
135 	{ "sigemt", RCTL_ACTION_SIGEMT },
136 	{ "sigfpe", RCTL_ACTION_SIGFPE },
137 	{ "sigkill", RCTL_ACTION_SIGKILL },
138 	{ "sigbus", RCTL_ACTION_SIGBUS },
139 	{ "sigsegv", RCTL_ACTION_SIGSEGV },
140 	{ "sigsys", RCTL_ACTION_SIGSYS },
141 	{ "sigpipe", RCTL_ACTION_SIGPIPE },
142 	{ "sigalrm", RCTL_ACTION_SIGALRM },
143 	{ "sigterm", RCTL_ACTION_SIGTERM },
144 	{ "sigurg", RCTL_ACTION_SIGURG },
145 	{ "sigstop", RCTL_ACTION_SIGSTOP },
146 	{ "sigtstp", RCTL_ACTION_SIGTSTP },
147 	{ "sigchld", RCTL_ACTION_SIGCHLD },
148 	{ "sigttin", RCTL_ACTION_SIGTTIN },
149 	{ "sigttou", RCTL_ACTION_SIGTTOU },
150 	{ "sigio", RCTL_ACTION_SIGIO },
151 	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
152 	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
153 	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
154 	{ "sigprof", RCTL_ACTION_SIGPROF },
155 	{ "sigwinch", RCTL_ACTION_SIGWINCH },
156 	{ "siginfo", RCTL_ACTION_SIGINFO },
157 	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
158 	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
159 	{ "sigthr", RCTL_ACTION_SIGTHR },
160 	{ "deny", RCTL_ACTION_DENY },
161 	{ "log", RCTL_ACTION_LOG },
162 	{ "devctl", RCTL_ACTION_DEVCTL },
163 	{ NULL, -1 }};
164 
165 static void rctl_init(void);
166 SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
167 
168 static uma_zone_t rctl_rule_link_zone;
169 static uma_zone_t rctl_rule_zone;
170 static struct rwlock rctl_lock;
171 RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock");
172 
173 static int rctl_rule_fully_specified(const struct rctl_rule *rule);
174 static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
175 
176 static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
177 
178 static const char *
179 rctl_subject_type_name(int subject)
180 {
181 	int i;
182 
183 	for (i = 0; subjectnames[i].d_name != NULL; i++) {
184 		if (subjectnames[i].d_value == subject)
185 			return (subjectnames[i].d_name);
186 	}
187 
188 	panic("rctl_subject_type_name: unknown subject type %d", subject);
189 }
190 
191 static const char *
192 rctl_action_name(int action)
193 {
194 	int i;
195 
196 	for (i = 0; actionnames[i].d_name != NULL; i++) {
197 		if (actionnames[i].d_value == action)
198 			return (actionnames[i].d_name);
199 	}
200 
201 	panic("rctl_action_name: unknown action %d", action);
202 }
203 
204 const char *
205 rctl_resource_name(int resource)
206 {
207 	int i;
208 
209 	for (i = 0; resourcenames[i].d_name != NULL; i++) {
210 		if (resourcenames[i].d_value == resource)
211 			return (resourcenames[i].d_name);
212 	}
213 
214 	panic("rctl_resource_name: unknown resource %d", resource);
215 }
216 
217 /*
218  * Return the amount of resource that can be allocated by 'p' before
219  * hitting 'rule'.
220  */
221 static int64_t
222 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
223 {
224 	int resource;
225 	int64_t available = INT64_MAX;
226 	struct ucred *cred = p->p_ucred;
227 
228 	rw_assert(&rctl_lock, RA_LOCKED);
229 
230 	resource = rule->rr_resource;
231 	switch (rule->rr_per) {
232 	case RCTL_SUBJECT_TYPE_PROCESS:
233 		available = rule->rr_amount -
234 		    p->p_racct->r_resources[resource];
235 		break;
236 	case RCTL_SUBJECT_TYPE_USER:
237 		available = rule->rr_amount -
238 		    cred->cr_ruidinfo->ui_racct->r_resources[resource];
239 		break;
240 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
241 		available = rule->rr_amount -
242 		    cred->cr_loginclass->lc_racct->r_resources[resource];
243 		break;
244 	case RCTL_SUBJECT_TYPE_JAIL:
245 		available = rule->rr_amount -
246 		    cred->cr_prison->pr_prison_racct->prr_racct->
247 		        r_resources[resource];
248 		break;
249 	default:
250 		panic("rctl_compute_available: unknown per %d",
251 		    rule->rr_per);
252 	}
253 
254 	return (available);
255 }
256 
257 /*
258  * Return non-zero if allocating 'amount' by proc 'p' would exceed
259  * resource limit specified by 'rule'.
260  */
261 static int
262 rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule,
263     int64_t amount)
264 {
265 	int64_t available;
266 
267 	rw_assert(&rctl_lock, RA_LOCKED);
268 
269 	available = rctl_available_resource(p, rule);
270 	if (available >= amount)
271 		return (0);
272 
273 	return (1);
274 }
275 
276 /*
277  * Special version of rctl_available() function for the %cpu resource.
278  * We slightly cheat here and return less than we normally would.
279  */
280 int64_t
281 rctl_pcpu_available(const struct proc *p) {
282 	struct rctl_rule *rule;
283 	struct rctl_rule_link *link;
284 	int64_t available, minavailable, limit;
285 
286 	minavailable = INT64_MAX;
287 	limit = 0;
288 
289 	rw_rlock(&rctl_lock);
290 
291 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
292 		rule = link->rrl_rule;
293 		if (rule->rr_resource != RACCT_PCTCPU)
294 			continue;
295 		if (rule->rr_action != RCTL_ACTION_DENY)
296 			continue;
297 		available = rctl_available_resource(p, rule);
298 		if (available < minavailable) {
299 			minavailable = available;
300 			limit = rule->rr_amount;
301 		}
302 	}
303 
304 	rw_runlock(&rctl_lock);
305 
306 	/*
307 	 * Return slightly less than actual value of the available
308 	 * %cpu resource.  This makes %cpu throttling more agressive
309 	 * and lets us act sooner than the limits are already exceeded.
310 	 */
311 	if (limit != 0) {
312 		if (limit > 2 * RCTL_PCPU_SHIFT)
313 			minavailable -= RCTL_PCPU_SHIFT;
314 		else
315 			minavailable -= (limit / 2);
316 	}
317 
318 	return (minavailable);
319 }
320 
321 /*
322  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
323  * to what it keeps allocated now.  Returns non-zero if the allocation should
324  * be denied, 0 otherwise.
325  */
326 int
327 rctl_enforce(struct proc *p, int resource, uint64_t amount)
328 {
329 	struct rctl_rule *rule;
330 	struct rctl_rule_link *link;
331 	struct sbuf sb;
332 	int should_deny = 0;
333 	char *buf;
334 	static int curtime = 0;
335 	static struct timeval lasttime;
336 
337 	rw_rlock(&rctl_lock);
338 
339 	/*
340 	 * There may be more than one matching rule; go through all of them.
341 	 * Denial should be done last, after logging and sending signals.
342 	 */
343 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
344 		rule = link->rrl_rule;
345 		if (rule->rr_resource != resource)
346 			continue;
347 		if (!rctl_would_exceed(p, rule, amount)) {
348 			link->rrl_exceeded = 0;
349 			continue;
350 		}
351 
352 		switch (rule->rr_action) {
353 		case RCTL_ACTION_DENY:
354 			should_deny = 1;
355 			continue;
356 		case RCTL_ACTION_LOG:
357 			/*
358 			 * If rrl_exceeded != 0, it means we've already
359 			 * logged a warning for this process.
360 			 */
361 			if (link->rrl_exceeded != 0)
362 				continue;
363 
364 			/*
365 			 * If the process state is not fully initialized yet,
366 			 * we can't access most of the required fields, e.g.
367 			 * p->p_comm.  This happens when called from fork1().
368 			 * Ignore this rule for now; it will be processed just
369 			 * after fork, when called from racct_proc_fork_done().
370 			 */
371 			if (p->p_state != PRS_NORMAL)
372 				continue;
373 
374 			if (!ppsratecheck(&lasttime, &curtime, 10))
375 				continue;
376 
377 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
378 			if (buf == NULL) {
379 				printf("rctl_enforce: out of memory\n");
380 				continue;
381 			}
382 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
383 			rctl_rule_to_sbuf(&sb, rule);
384 			sbuf_finish(&sb);
385 			printf("rctl: rule \"%s\" matched by pid %d "
386 			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
387 			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
388 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
389 			sbuf_delete(&sb);
390 			free(buf, M_RCTL);
391 			link->rrl_exceeded = 1;
392 			continue;
393 		case RCTL_ACTION_DEVCTL:
394 			if (link->rrl_exceeded != 0)
395 				continue;
396 
397 			if (p->p_state != PRS_NORMAL)
398 				continue;
399 
400 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
401 			if (buf == NULL) {
402 				printf("rctl_enforce: out of memory\n");
403 				continue;
404 			}
405 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
406 			sbuf_printf(&sb, "rule=");
407 			rctl_rule_to_sbuf(&sb, rule);
408 			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
409 			    p->p_pid, p->p_ucred->cr_ruid,
410 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
411 			sbuf_finish(&sb);
412 			devctl_notify_f("RCTL", "rule", "matched",
413 			    sbuf_data(&sb), M_NOWAIT);
414 			sbuf_delete(&sb);
415 			free(buf, M_RCTL);
416 			link->rrl_exceeded = 1;
417 			continue;
418 		default:
419 			if (link->rrl_exceeded != 0)
420 				continue;
421 
422 			if (p->p_state != PRS_NORMAL)
423 				continue;
424 
425 			KASSERT(rule->rr_action > 0 &&
426 			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
427 			    ("rctl_enforce: unknown action %d",
428 			     rule->rr_action));
429 
430 			/*
431 			 * We're using the fact that RCTL_ACTION_SIG* values
432 			 * are equal to their counterparts from sys/signal.h.
433 			 */
434 			kern_psignal(p, rule->rr_action);
435 			link->rrl_exceeded = 1;
436 			continue;
437 		}
438 	}
439 
440 	rw_runlock(&rctl_lock);
441 
442 	if (should_deny) {
443 		/*
444 		 * Return fake error code; the caller should change it
445 		 * into one proper for the situation - EFSIZ, ENOMEM etc.
446 		 */
447 		return (EDOOFUS);
448 	}
449 
450 	return (0);
451 }
452 
453 uint64_t
454 rctl_get_limit(struct proc *p, int resource)
455 {
456 	struct rctl_rule *rule;
457 	struct rctl_rule_link *link;
458 	uint64_t amount = UINT64_MAX;
459 
460 	rw_rlock(&rctl_lock);
461 
462 	/*
463 	 * There may be more than one matching rule; go through all of them.
464 	 * Denial should be done last, after logging and sending signals.
465 	 */
466 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
467 		rule = link->rrl_rule;
468 		if (rule->rr_resource != resource)
469 			continue;
470 		if (rule->rr_action != RCTL_ACTION_DENY)
471 			continue;
472 		if (rule->rr_amount < amount)
473 			amount = rule->rr_amount;
474 	}
475 
476 	rw_runlock(&rctl_lock);
477 
478 	return (amount);
479 }
480 
481 uint64_t
482 rctl_get_available(struct proc *p, int resource)
483 {
484 	struct rctl_rule *rule;
485 	struct rctl_rule_link *link;
486 	int64_t available, minavailable, allocated;
487 
488 	minavailable = INT64_MAX;
489 
490 	rw_rlock(&rctl_lock);
491 
492 	/*
493 	 * There may be more than one matching rule; go through all of them.
494 	 * Denial should be done last, after logging and sending signals.
495 	 */
496 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
497 		rule = link->rrl_rule;
498 		if (rule->rr_resource != resource)
499 			continue;
500 		if (rule->rr_action != RCTL_ACTION_DENY)
501 			continue;
502 		available = rctl_available_resource(p, rule);
503 		if (available < minavailable)
504 			minavailable = available;
505 	}
506 
507 	rw_runlock(&rctl_lock);
508 
509 	/*
510 	 * XXX: Think about this _hard_.
511 	 */
512 	allocated = p->p_racct->r_resources[resource];
513 	if (minavailable < INT64_MAX - allocated)
514 		minavailable += allocated;
515 	if (minavailable < 0)
516 		minavailable = 0;
517 	return (minavailable);
518 }
519 
520 static int
521 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
522 {
523 
524 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
525 		if (rule->rr_subject_type != filter->rr_subject_type)
526 			return (0);
527 
528 		switch (filter->rr_subject_type) {
529 		case RCTL_SUBJECT_TYPE_PROCESS:
530 			if (filter->rr_subject.rs_proc != NULL &&
531 			    rule->rr_subject.rs_proc !=
532 			    filter->rr_subject.rs_proc)
533 				return (0);
534 			break;
535 		case RCTL_SUBJECT_TYPE_USER:
536 			if (filter->rr_subject.rs_uip != NULL &&
537 			    rule->rr_subject.rs_uip !=
538 			    filter->rr_subject.rs_uip)
539 				return (0);
540 			break;
541 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
542 			if (filter->rr_subject.rs_loginclass != NULL &&
543 			    rule->rr_subject.rs_loginclass !=
544 			    filter->rr_subject.rs_loginclass)
545 				return (0);
546 			break;
547 		case RCTL_SUBJECT_TYPE_JAIL:
548 			if (filter->rr_subject.rs_prison_racct != NULL &&
549 			    rule->rr_subject.rs_prison_racct !=
550 			    filter->rr_subject.rs_prison_racct)
551 				return (0);
552 			break;
553 		default:
554 			panic("rctl_rule_matches: unknown subject type %d",
555 			    filter->rr_subject_type);
556 		}
557 	}
558 
559 	if (filter->rr_resource != RACCT_UNDEFINED) {
560 		if (rule->rr_resource != filter->rr_resource)
561 			return (0);
562 	}
563 
564 	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
565 		if (rule->rr_action != filter->rr_action)
566 			return (0);
567 	}
568 
569 	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
570 		if (rule->rr_amount != filter->rr_amount)
571 			return (0);
572 	}
573 
574 	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
575 		if (rule->rr_per != filter->rr_per)
576 			return (0);
577 	}
578 
579 	return (1);
580 }
581 
582 static int
583 str2value(const char *str, int *value, struct dict *table)
584 {
585 	int i;
586 
587 	if (value == NULL)
588 		return (EINVAL);
589 
590 	for (i = 0; table[i].d_name != NULL; i++) {
591 		if (strcasecmp(table[i].d_name, str) == 0) {
592 			*value =  table[i].d_value;
593 			return (0);
594 		}
595 	}
596 
597 	return (EINVAL);
598 }
599 
600 static int
601 str2id(const char *str, id_t *value)
602 {
603 	char *end;
604 
605 	if (str == NULL)
606 		return (EINVAL);
607 
608 	*value = strtoul(str, &end, 10);
609 	if ((size_t)(end - str) != strlen(str))
610 		return (EINVAL);
611 
612 	return (0);
613 }
614 
/*
 * Parse 'str' as a decimal number into '*value'.
 *
 * Returns 0 on success, EINVAL when 'str' is NULL or contains characters
 * that are not part of the number, and ERANGE when the parsed number does
 * not fit into a non-negative int64_t.  '*value' is only written on
 * success.
 *
 * The range check matters because strtoul() yields an unsigned value:
 * storing it unchecked would turn inputs such as "-1" or anything above
 * INT64_MAX into negative amounts.
 */
static int
str2int64(const char *str, int64_t *value)
{
	unsigned long parsed;
	char *end;

	if (str == NULL)
		return (EINVAL);

	parsed = strtoul(str, &end, 10);
	/* The conversion must have stopped at the terminating NUL. */
	if (*end != '\0')
		return (EINVAL);
	if ((uint64_t)parsed > (uint64_t)INT64_MAX)
		return (ERANGE);

	*value = (int64_t)parsed;
	return (0);
}
629 
630 /*
631  * Connect the rule to the racct, increasing refcount for the rule.
632  */
633 static void
634 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
635 {
636 	struct rctl_rule_link *link;
637 
638 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
639 
640 	rctl_rule_acquire(rule);
641 	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
642 	link->rrl_rule = rule;
643 	link->rrl_exceeded = 0;
644 
645 	rw_wlock(&rctl_lock);
646 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
647 	rw_wunlock(&rctl_lock);
648 }
649 
650 static int
651 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
652 {
653 	struct rctl_rule_link *link;
654 
655 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
656 	rw_assert(&rctl_lock, RA_WLOCKED);
657 
658 	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
659 	if (link == NULL)
660 		return (ENOMEM);
661 	rctl_rule_acquire(rule);
662 	link->rrl_rule = rule;
663 	link->rrl_exceeded = 0;
664 
665 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
666 	return (0);
667 }
668 
669 /*
670  * Remove limits for a rules matching the filter and release
671  * the refcounts for the rules, possibly freeing them.  Returns
672  * the number of limit structures removed.
673  */
674 static int
675 rctl_racct_remove_rules(struct racct *racct,
676     const struct rctl_rule *filter)
677 {
678 	int removed = 0;
679 	struct rctl_rule_link *link, *linktmp;
680 
681 	rw_assert(&rctl_lock, RA_WLOCKED);
682 
683 	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
684 		if (!rctl_rule_matches(link->rrl_rule, filter))
685 			continue;
686 
687 		LIST_REMOVE(link, rrl_next);
688 		rctl_rule_release(link->rrl_rule);
689 		uma_zfree(rctl_rule_link_zone, link);
690 		removed++;
691 	}
692 	return (removed);
693 }
694 
695 static void
696 rctl_rule_acquire_subject(struct rctl_rule *rule)
697 {
698 
699 	switch (rule->rr_subject_type) {
700 	case RCTL_SUBJECT_TYPE_UNDEFINED:
701 	case RCTL_SUBJECT_TYPE_PROCESS:
702 		break;
703 	case RCTL_SUBJECT_TYPE_JAIL:
704 		if (rule->rr_subject.rs_prison_racct != NULL)
705 			prison_racct_hold(rule->rr_subject.rs_prison_racct);
706 		break;
707 	case RCTL_SUBJECT_TYPE_USER:
708 		if (rule->rr_subject.rs_uip != NULL)
709 			uihold(rule->rr_subject.rs_uip);
710 		break;
711 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
712 		if (rule->rr_subject.rs_loginclass != NULL)
713 			loginclass_hold(rule->rr_subject.rs_loginclass);
714 		break;
715 	default:
716 		panic("rctl_rule_acquire_subject: unknown subject type %d",
717 		    rule->rr_subject_type);
718 	}
719 }
720 
721 static void
722 rctl_rule_release_subject(struct rctl_rule *rule)
723 {
724 
725 	switch (rule->rr_subject_type) {
726 	case RCTL_SUBJECT_TYPE_UNDEFINED:
727 	case RCTL_SUBJECT_TYPE_PROCESS:
728 		break;
729 	case RCTL_SUBJECT_TYPE_JAIL:
730 		if (rule->rr_subject.rs_prison_racct != NULL)
731 			prison_racct_free(rule->rr_subject.rs_prison_racct);
732 		break;
733 	case RCTL_SUBJECT_TYPE_USER:
734 		if (rule->rr_subject.rs_uip != NULL)
735 			uifree(rule->rr_subject.rs_uip);
736 		break;
737 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
738 		if (rule->rr_subject.rs_loginclass != NULL)
739 			loginclass_free(rule->rr_subject.rs_loginclass);
740 		break;
741 	default:
742 		panic("rctl_rule_release_subject: unknown subject type %d",
743 		    rule->rr_subject_type);
744 	}
745 }
746 
747 struct rctl_rule *
748 rctl_rule_alloc(int flags)
749 {
750 	struct rctl_rule *rule;
751 
752 	rule = uma_zalloc(rctl_rule_zone, flags);
753 	if (rule == NULL)
754 		return (NULL);
755 	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
756 	rule->rr_subject.rs_proc = NULL;
757 	rule->rr_subject.rs_uip = NULL;
758 	rule->rr_subject.rs_loginclass = NULL;
759 	rule->rr_subject.rs_prison_racct = NULL;
760 	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
761 	rule->rr_resource = RACCT_UNDEFINED;
762 	rule->rr_action = RCTL_ACTION_UNDEFINED;
763 	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
764 	refcount_init(&rule->rr_refcount, 1);
765 
766 	return (rule);
767 }
768 
769 struct rctl_rule *
770 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
771 {
772 	struct rctl_rule *copy;
773 
774 	copy = uma_zalloc(rctl_rule_zone, flags);
775 	if (copy == NULL)
776 		return (NULL);
777 	copy->rr_subject_type = rule->rr_subject_type;
778 	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
779 	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
780 	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
781 	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
782 	copy->rr_per = rule->rr_per;
783 	copy->rr_resource = rule->rr_resource;
784 	copy->rr_action = rule->rr_action;
785 	copy->rr_amount = rule->rr_amount;
786 	refcount_init(&copy->rr_refcount, 1);
787 	rctl_rule_acquire_subject(copy);
788 
789 	return (copy);
790 }
791 
792 void
793 rctl_rule_acquire(struct rctl_rule *rule)
794 {
795 
796 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
797 
798 	refcount_acquire(&rule->rr_refcount);
799 }
800 
801 static void
802 rctl_rule_free(void *context, int pending)
803 {
804 	struct rctl_rule *rule;
805 
806 	rule = (struct rctl_rule *)context;
807 
808 	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
809 
810 	/*
811 	 * We don't need locking here; rule is guaranteed to be inaccessible.
812 	 */
813 
814 	rctl_rule_release_subject(rule);
815 	uma_zfree(rctl_rule_zone, rule);
816 }
817 
818 void
819 rctl_rule_release(struct rctl_rule *rule)
820 {
821 
822 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
823 
824 	if (refcount_release(&rule->rr_refcount)) {
825 		/*
826 		 * rctl_rule_release() is often called when iterating
827 		 * over all the uidinfo structures in the system,
828 		 * holding uihashtbl_lock.  Since rctl_rule_free()
829 		 * might end up calling uifree(), this would lead
830 		 * to lock recursion.  Use taskqueue to avoid this.
831 		 */
832 		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
833 		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
834 	}
835 }
836 
837 static int
838 rctl_rule_fully_specified(const struct rctl_rule *rule)
839 {
840 
841 	switch (rule->rr_subject_type) {
842 	case RCTL_SUBJECT_TYPE_UNDEFINED:
843 		return (0);
844 	case RCTL_SUBJECT_TYPE_PROCESS:
845 		if (rule->rr_subject.rs_proc == NULL)
846 			return (0);
847 		break;
848 	case RCTL_SUBJECT_TYPE_USER:
849 		if (rule->rr_subject.rs_uip == NULL)
850 			return (0);
851 		break;
852 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
853 		if (rule->rr_subject.rs_loginclass == NULL)
854 			return (0);
855 		break;
856 	case RCTL_SUBJECT_TYPE_JAIL:
857 		if (rule->rr_subject.rs_prison_racct == NULL)
858 			return (0);
859 		break;
860 	default:
861 		panic("rctl_rule_fully_specified: unknown subject type %d",
862 		    rule->rr_subject_type);
863 	}
864 	if (rule->rr_resource == RACCT_UNDEFINED)
865 		return (0);
866 	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
867 		return (0);
868 	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
869 		return (0);
870 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
871 		return (0);
872 
873 	return (1);
874 }
875 
876 static int
877 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
878 {
879 	int error = 0;
880 	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
881 	     *amountstr, *perstr;
882 	struct rctl_rule *rule;
883 	id_t id;
884 
885 	rule = rctl_rule_alloc(M_WAITOK);
886 
887 	subjectstr = strsep(&rulestr, ":");
888 	subject_idstr = strsep(&rulestr, ":");
889 	resourcestr = strsep(&rulestr, ":");
890 	actionstr = strsep(&rulestr, "=/");
891 	amountstr = strsep(&rulestr, "/");
892 	perstr = rulestr;
893 
894 	if (subjectstr == NULL || subjectstr[0] == '\0')
895 		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
896 	else {
897 		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
898 		if (error != 0)
899 			goto out;
900 	}
901 
902 	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
903 		rule->rr_subject.rs_proc = NULL;
904 		rule->rr_subject.rs_uip = NULL;
905 		rule->rr_subject.rs_loginclass = NULL;
906 		rule->rr_subject.rs_prison_racct = NULL;
907 	} else {
908 		switch (rule->rr_subject_type) {
909 		case RCTL_SUBJECT_TYPE_UNDEFINED:
910 			error = EINVAL;
911 			goto out;
912 		case RCTL_SUBJECT_TYPE_PROCESS:
913 			error = str2id(subject_idstr, &id);
914 			if (error != 0)
915 				goto out;
916 			sx_assert(&allproc_lock, SA_LOCKED);
917 			rule->rr_subject.rs_proc = pfind(id);
918 			if (rule->rr_subject.rs_proc == NULL) {
919 				error = ESRCH;
920 				goto out;
921 			}
922 			PROC_UNLOCK(rule->rr_subject.rs_proc);
923 			break;
924 		case RCTL_SUBJECT_TYPE_USER:
925 			error = str2id(subject_idstr, &id);
926 			if (error != 0)
927 				goto out;
928 			rule->rr_subject.rs_uip = uifind(id);
929 			break;
930 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
931 			rule->rr_subject.rs_loginclass =
932 			    loginclass_find(subject_idstr);
933 			if (rule->rr_subject.rs_loginclass == NULL) {
934 				error = ENAMETOOLONG;
935 				goto out;
936 			}
937 			break;
938 		case RCTL_SUBJECT_TYPE_JAIL:
939 			rule->rr_subject.rs_prison_racct =
940 			    prison_racct_find(subject_idstr);
941 			if (rule->rr_subject.rs_prison_racct == NULL) {
942 				error = ENAMETOOLONG;
943 				goto out;
944 			}
945 			break;
946                default:
947                        panic("rctl_string_to_rule: unknown subject type %d",
948                            rule->rr_subject_type);
949                }
950 	}
951 
952 	if (resourcestr == NULL || resourcestr[0] == '\0')
953 		rule->rr_resource = RACCT_UNDEFINED;
954 	else {
955 		error = str2value(resourcestr, &rule->rr_resource,
956 		    resourcenames);
957 		if (error != 0)
958 			goto out;
959 	}
960 
961 	if (actionstr == NULL || actionstr[0] == '\0')
962 		rule->rr_action = RCTL_ACTION_UNDEFINED;
963 	else {
964 		error = str2value(actionstr, &rule->rr_action, actionnames);
965 		if (error != 0)
966 			goto out;
967 	}
968 
969 	if (amountstr == NULL || amountstr[0] == '\0')
970 		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
971 	else {
972 		error = str2int64(amountstr, &rule->rr_amount);
973 		if (error != 0)
974 			goto out;
975 		if (RACCT_IS_IN_MILLIONS(rule->rr_resource))
976 			rule->rr_amount *= 1000000;
977 	}
978 
979 	if (perstr == NULL || perstr[0] == '\0')
980 		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
981 	else {
982 		error = str2value(perstr, &rule->rr_per, subjectnames);
983 		if (error != 0)
984 			goto out;
985 	}
986 
987 out:
988 	if (error == 0)
989 		*rulep = rule;
990 	else
991 		rctl_rule_release(rule);
992 
993 	return (error);
994 }
995 
996 /*
997  * Link a rule with all the subjects it applies to.
998  */
999 int
1000 rctl_rule_add(struct rctl_rule *rule)
1001 {
1002 	struct proc *p;
1003 	struct ucred *cred;
1004 	struct uidinfo *uip;
1005 	struct prison *pr;
1006 	struct prison_racct *prr;
1007 	struct loginclass *lc;
1008 	struct rctl_rule *rule2;
1009 	int match;
1010 
1011 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1012 
1013 	/*
1014 	 * Some rules just don't make sense.  Note that the one below
1015 	 * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU,
1016 	 * for example, is not deniable in the racct sense, but the
1017 	 * limit is enforced in a different way, so "deny" rules for %CPU
1018 	 * do make sense.
1019 	 */
1020 	if (rule->rr_action == RCTL_ACTION_DENY &&
1021 	    (rule->rr_resource == RACCT_CPU ||
1022 	    rule->rr_resource == RACCT_WALLCLOCK))
1023 		return (EOPNOTSUPP);
1024 
1025 	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1026 	    RACCT_IS_SLOPPY(rule->rr_resource))
1027 		return (EOPNOTSUPP);
1028 
1029 	/*
1030 	 * Make sure there are no duplicated rules.  Also, for the "deny"
1031 	 * rules, remove ones differing only by "amount".
1032 	 */
1033 	if (rule->rr_action == RCTL_ACTION_DENY) {
1034 		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1035 		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1036 		rctl_rule_remove(rule2);
1037 		rctl_rule_release(rule2);
1038 	} else
1039 		rctl_rule_remove(rule);
1040 
1041 	switch (rule->rr_subject_type) {
1042 	case RCTL_SUBJECT_TYPE_PROCESS:
1043 		p = rule->rr_subject.rs_proc;
1044 		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1045 
1046 		rctl_racct_add_rule(p->p_racct, rule);
1047 		/*
1048 		 * In case of per-process rule, we don't have anything more
1049 		 * to do.
1050 		 */
1051 		return (0);
1052 
1053 	case RCTL_SUBJECT_TYPE_USER:
1054 		uip = rule->rr_subject.rs_uip;
1055 		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1056 		rctl_racct_add_rule(uip->ui_racct, rule);
1057 		break;
1058 
1059 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1060 		lc = rule->rr_subject.rs_loginclass;
1061 		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1062 		rctl_racct_add_rule(lc->lc_racct, rule);
1063 		break;
1064 
1065 	case RCTL_SUBJECT_TYPE_JAIL:
1066 		prr = rule->rr_subject.rs_prison_racct;
1067 		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1068 		rctl_racct_add_rule(prr->prr_racct, rule);
1069 		break;
1070 
1071 	default:
1072 		panic("rctl_rule_add: unknown subject type %d",
1073 		    rule->rr_subject_type);
1074 	}
1075 
1076 	/*
1077 	 * Now go through all the processes and add the new rule to the ones
1078 	 * it applies to.
1079 	 */
1080 	sx_assert(&allproc_lock, SA_LOCKED);
1081 	FOREACH_PROC_IN_SYSTEM(p) {
1082 		cred = p->p_ucred;
1083 		switch (rule->rr_subject_type) {
1084 		case RCTL_SUBJECT_TYPE_USER:
1085 			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1086 			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1087 				break;
1088 			continue;
1089 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1090 			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1091 				break;
1092 			continue;
1093 		case RCTL_SUBJECT_TYPE_JAIL:
1094 			match = 0;
1095 			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1096 				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1097 					match = 1;
1098 					break;
1099 				}
1100 			}
1101 			if (match)
1102 				break;
1103 			continue;
1104 		default:
1105 			panic("rctl_rule_add: unknown subject type %d",
1106 			    rule->rr_subject_type);
1107 		}
1108 
1109 		rctl_racct_add_rule(p->p_racct, rule);
1110 	}
1111 
1112 	return (0);
1113 }
1114 
1115 static void
1116 rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
1117 {
1118 	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1119 	int found = 0;
1120 
1121 	rw_wlock(&rctl_lock);
1122 	found += rctl_racct_remove_rules(racct, filter);
1123 	rw_wunlock(&rctl_lock);
1124 
1125 	*((int *)arg3) += found;
1126 }
1127 
1128 /*
1129  * Remove all rules that match the filter.
1130  */
1131 int
1132 rctl_rule_remove(struct rctl_rule *filter)
1133 {
1134 	int found = 0;
1135 	struct proc *p;
1136 
1137 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1138 	    filter->rr_subject.rs_proc != NULL) {
1139 		p = filter->rr_subject.rs_proc;
1140 		rw_wlock(&rctl_lock);
1141 		found = rctl_racct_remove_rules(p->p_racct, filter);
1142 		rw_wunlock(&rctl_lock);
1143 		if (found)
1144 			return (0);
1145 		return (ESRCH);
1146 	}
1147 
1148 	loginclass_racct_foreach(rctl_rule_remove_callback, filter,
1149 	    (void *)&found);
1150 	ui_racct_foreach(rctl_rule_remove_callback, filter,
1151 	    (void *)&found);
1152 	prison_racct_foreach(rctl_rule_remove_callback, filter,
1153 	    (void *)&found);
1154 
1155 	sx_assert(&allproc_lock, SA_LOCKED);
1156 	rw_wlock(&rctl_lock);
1157 	FOREACH_PROC_IN_SYSTEM(p) {
1158 		found += rctl_racct_remove_rules(p->p_racct, filter);
1159 	}
1160 	rw_wunlock(&rctl_lock);
1161 
1162 	if (found)
1163 		return (0);
1164 	return (ESRCH);
1165 }
1166 
1167 /*
1168  * Appends a rule to the sbuf.
1169  */
1170 static void
1171 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1172 {
1173 	int64_t amount;
1174 
1175 	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1176 
1177 	switch (rule->rr_subject_type) {
1178 	case RCTL_SUBJECT_TYPE_PROCESS:
1179 		if (rule->rr_subject.rs_proc == NULL)
1180 			sbuf_printf(sb, ":");
1181 		else
1182 			sbuf_printf(sb, "%d:",
1183 			    rule->rr_subject.rs_proc->p_pid);
1184 		break;
1185 	case RCTL_SUBJECT_TYPE_USER:
1186 		if (rule->rr_subject.rs_uip == NULL)
1187 			sbuf_printf(sb, ":");
1188 		else
1189 			sbuf_printf(sb, "%d:",
1190 			    rule->rr_subject.rs_uip->ui_uid);
1191 		break;
1192 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1193 		if (rule->rr_subject.rs_loginclass == NULL)
1194 			sbuf_printf(sb, ":");
1195 		else
1196 			sbuf_printf(sb, "%s:",
1197 			    rule->rr_subject.rs_loginclass->lc_name);
1198 		break;
1199 	case RCTL_SUBJECT_TYPE_JAIL:
1200 		if (rule->rr_subject.rs_prison_racct == NULL)
1201 			sbuf_printf(sb, ":");
1202 		else
1203 			sbuf_printf(sb, "%s:",
1204 			    rule->rr_subject.rs_prison_racct->prr_name);
1205 		break;
1206 	default:
1207 		panic("rctl_rule_to_sbuf: unknown subject type %d",
1208 		    rule->rr_subject_type);
1209 	}
1210 
1211 	amount = rule->rr_amount;
1212 	if (amount != RCTL_AMOUNT_UNDEFINED &&
1213 	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1214 		amount /= 1000000;
1215 
1216 	sbuf_printf(sb, "%s:%s=%jd",
1217 	    rctl_resource_name(rule->rr_resource),
1218 	    rctl_action_name(rule->rr_action),
1219 	    amount);
1220 
1221 	if (rule->rr_per != rule->rr_subject_type)
1222 		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1223 }
1224 
1225 /*
1226  * Routine used by RCTL syscalls to read in input string.
1227  */
1228 static int
1229 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1230 {
1231 	int error;
1232 	char *str;
1233 
1234 	if (inbuflen <= 0)
1235 		return (EINVAL);
1236 	if (inbuflen > RCTL_MAX_INBUFLEN)
1237 		return (E2BIG);
1238 
1239 	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1240 	error = copyinstr(inbufp, str, inbuflen, NULL);
1241 	if (error != 0) {
1242 		free(str, M_RCTL);
1243 		return (error);
1244 	}
1245 
1246 	*inputstr = str;
1247 
1248 	return (0);
1249 }
1250 
1251 /*
1252  * Routine used by RCTL syscalls to write out output string.
1253  */
1254 static int
1255 rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1256 {
1257 	int error;
1258 
1259 	if (outputsbuf == NULL)
1260 		return (0);
1261 
1262 	sbuf_finish(outputsbuf);
1263 	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1264 		sbuf_delete(outputsbuf);
1265 		return (ERANGE);
1266 	}
1267 	error = copyout(sbuf_data(outputsbuf), outbufp,
1268 	    sbuf_len(outputsbuf) + 1);
1269 	sbuf_delete(outputsbuf);
1270 	return (error);
1271 }
1272 
1273 static struct sbuf *
1274 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1275 {
1276 	int i;
1277 	int64_t amount;
1278 	struct sbuf *sb;
1279 
1280 	sb = sbuf_new_auto();
1281 	for (i = 0; i <= RACCT_MAX; i++) {
1282 		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1283 			continue;
1284 		amount = racct->r_resources[i];
1285 		if (RACCT_IS_IN_MILLIONS(i))
1286 			amount /= 1000000;
1287 		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1288 	}
1289 	sbuf_setpos(sb, sbuf_len(sb) - 1);
1290 	return (sb);
1291 }
1292 
1293 int
1294 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1295 {
1296 	int error;
1297 	char *inputstr;
1298 	struct rctl_rule *filter;
1299 	struct sbuf *outputsbuf = NULL;
1300 	struct proc *p;
1301 	struct uidinfo *uip;
1302 	struct loginclass *lc;
1303 	struct prison_racct *prr;
1304 
1305 	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1306 	if (error != 0)
1307 		return (error);
1308 
1309 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1310 	if (error != 0)
1311 		return (error);
1312 
1313 	sx_slock(&allproc_lock);
1314 	error = rctl_string_to_rule(inputstr, &filter);
1315 	free(inputstr, M_RCTL);
1316 	if (error != 0) {
1317 		sx_sunlock(&allproc_lock);
1318 		return (error);
1319 	}
1320 
1321 	switch (filter->rr_subject_type) {
1322 	case RCTL_SUBJECT_TYPE_PROCESS:
1323 		p = filter->rr_subject.rs_proc;
1324 		if (p == NULL) {
1325 			error = EINVAL;
1326 			goto out;
1327 		}
1328 		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1329 		break;
1330 	case RCTL_SUBJECT_TYPE_USER:
1331 		uip = filter->rr_subject.rs_uip;
1332 		if (uip == NULL) {
1333 			error = EINVAL;
1334 			goto out;
1335 		}
1336 		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1337 		break;
1338 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1339 		lc = filter->rr_subject.rs_loginclass;
1340 		if (lc == NULL) {
1341 			error = EINVAL;
1342 			goto out;
1343 		}
1344 		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1345 		break;
1346 	case RCTL_SUBJECT_TYPE_JAIL:
1347 		prr = filter->rr_subject.rs_prison_racct;
1348 		if (prr == NULL) {
1349 			error = EINVAL;
1350 			goto out;
1351 		}
1352 		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1353 		break;
1354 	default:
1355 		error = EINVAL;
1356 	}
1357 out:
1358 	rctl_rule_release(filter);
1359 	sx_sunlock(&allproc_lock);
1360 	if (error != 0)
1361 		return (error);
1362 
1363 	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1364 
1365 	return (error);
1366 }
1367 
1368 static void
1369 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1370 {
1371 	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1372 	struct rctl_rule_link *link;
1373 	struct sbuf *sb = (struct sbuf *)arg3;
1374 
1375 	rw_rlock(&rctl_lock);
1376 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1377 		if (!rctl_rule_matches(link->rrl_rule, filter))
1378 			continue;
1379 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1380 		sbuf_printf(sb, ",");
1381 	}
1382 	rw_runlock(&rctl_lock);
1383 }
1384 
1385 int
1386 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1387 {
1388 	int error;
1389 	size_t bufsize = RCTL_DEFAULT_BUFSIZE;
1390 	char *inputstr, *buf;
1391 	struct sbuf *sb;
1392 	struct rctl_rule *filter;
1393 	struct rctl_rule_link *link;
1394 	struct proc *p;
1395 
1396 	error = priv_check(td, PRIV_RCTL_GET_RULES);
1397 	if (error != 0)
1398 		return (error);
1399 
1400 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1401 	if (error != 0)
1402 		return (error);
1403 
1404 	sx_slock(&allproc_lock);
1405 	error = rctl_string_to_rule(inputstr, &filter);
1406 	free(inputstr, M_RCTL);
1407 	if (error != 0) {
1408 		sx_sunlock(&allproc_lock);
1409 		return (error);
1410 	}
1411 
1412 again:
1413 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1414 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1415 	KASSERT(sb != NULL, ("sbuf_new failed"));
1416 
1417 	sx_assert(&allproc_lock, SA_LOCKED);
1418 	FOREACH_PROC_IN_SYSTEM(p) {
1419 		rw_rlock(&rctl_lock);
1420 		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1421 			/*
1422 			 * Non-process rules will be added to the buffer later.
1423 			 * Adding them here would result in duplicated output.
1424 			 */
1425 			if (link->rrl_rule->rr_subject_type !=
1426 			    RCTL_SUBJECT_TYPE_PROCESS)
1427 				continue;
1428 			if (!rctl_rule_matches(link->rrl_rule, filter))
1429 				continue;
1430 			rctl_rule_to_sbuf(sb, link->rrl_rule);
1431 			sbuf_printf(sb, ",");
1432 		}
1433 		rw_runlock(&rctl_lock);
1434 	}
1435 
1436 	loginclass_racct_foreach(rctl_get_rules_callback, filter, sb);
1437 	ui_racct_foreach(rctl_get_rules_callback, filter, sb);
1438 	prison_racct_foreach(rctl_get_rules_callback, filter, sb);
1439 	if (sbuf_error(sb) == ENOMEM) {
1440 		sbuf_delete(sb);
1441 		free(buf, M_RCTL);
1442 		bufsize *= 4;
1443 		goto again;
1444 	}
1445 
1446 	/*
1447 	 * Remove trailing ",".
1448 	 */
1449 	if (sbuf_len(sb) > 0)
1450 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1451 
1452 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1453 
1454 	rctl_rule_release(filter);
1455 	sx_sunlock(&allproc_lock);
1456 	free(buf, M_RCTL);
1457 	return (error);
1458 }
1459 
1460 int
1461 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1462 {
1463 	int error;
1464 	size_t bufsize = RCTL_DEFAULT_BUFSIZE;
1465 	char *inputstr, *buf;
1466 	struct sbuf *sb;
1467 	struct rctl_rule *filter;
1468 	struct rctl_rule_link *link;
1469 
1470 	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1471 	if (error != 0)
1472 		return (error);
1473 
1474 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1475 	if (error != 0)
1476 		return (error);
1477 
1478 	sx_slock(&allproc_lock);
1479 	error = rctl_string_to_rule(inputstr, &filter);
1480 	free(inputstr, M_RCTL);
1481 	if (error != 0) {
1482 		sx_sunlock(&allproc_lock);
1483 		return (error);
1484 	}
1485 
1486 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1487 		rctl_rule_release(filter);
1488 		sx_sunlock(&allproc_lock);
1489 		return (EINVAL);
1490 	}
1491 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1492 		rctl_rule_release(filter);
1493 		sx_sunlock(&allproc_lock);
1494 		return (EOPNOTSUPP);
1495 	}
1496 	if (filter->rr_subject.rs_proc == NULL) {
1497 		rctl_rule_release(filter);
1498 		sx_sunlock(&allproc_lock);
1499 		return (EINVAL);
1500 	}
1501 
1502 again:
1503 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1504 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1505 	KASSERT(sb != NULL, ("sbuf_new failed"));
1506 
1507 	rw_rlock(&rctl_lock);
1508 	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1509 	    rrl_next) {
1510 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1511 		sbuf_printf(sb, ",");
1512 	}
1513 	rw_runlock(&rctl_lock);
1514 	if (sbuf_error(sb) == ENOMEM) {
1515 		sbuf_delete(sb);
1516 		free(buf, M_RCTL);
1517 		bufsize *= 4;
1518 		goto again;
1519 	}
1520 
1521 	/*
1522 	 * Remove trailing ",".
1523 	 */
1524 	if (sbuf_len(sb) > 0)
1525 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1526 
1527 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1528 	rctl_rule_release(filter);
1529 	sx_sunlock(&allproc_lock);
1530 	free(buf, M_RCTL);
1531 	return (error);
1532 }
1533 
1534 int
1535 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1536 {
1537 	int error;
1538 	struct rctl_rule *rule;
1539 	char *inputstr;
1540 
1541 	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1542 	if (error != 0)
1543 		return (error);
1544 
1545 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1546 	if (error != 0)
1547 		return (error);
1548 
1549 	sx_slock(&allproc_lock);
1550 	error = rctl_string_to_rule(inputstr, &rule);
1551 	free(inputstr, M_RCTL);
1552 	if (error != 0) {
1553 		sx_sunlock(&allproc_lock);
1554 		return (error);
1555 	}
1556 	/*
1557 	 * The 'per' part of a rule is optional.
1558 	 */
1559 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1560 	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1561 		rule->rr_per = rule->rr_subject_type;
1562 
1563 	if (!rctl_rule_fully_specified(rule)) {
1564 		error = EINVAL;
1565 		goto out;
1566 	}
1567 
1568 	error = rctl_rule_add(rule);
1569 
1570 out:
1571 	rctl_rule_release(rule);
1572 	sx_sunlock(&allproc_lock);
1573 	return (error);
1574 }
1575 
1576 int
1577 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1578 {
1579 	int error;
1580 	struct rctl_rule *filter;
1581 	char *inputstr;
1582 
1583 	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1584 	if (error != 0)
1585 		return (error);
1586 
1587 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1588 	if (error != 0)
1589 		return (error);
1590 
1591 	sx_slock(&allproc_lock);
1592 	error = rctl_string_to_rule(inputstr, &filter);
1593 	free(inputstr, M_RCTL);
1594 	if (error != 0) {
1595 		sx_sunlock(&allproc_lock);
1596 		return (error);
1597 	}
1598 
1599 	error = rctl_rule_remove(filter);
1600 	rctl_rule_release(filter);
1601 	sx_sunlock(&allproc_lock);
1602 
1603 	return (error);
1604 }
1605 
1606 /*
1607  * Update RCTL rule list after credential change.
1608  */
1609 void
1610 rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1611 {
1612 	int rulecnt, i;
1613 	struct rctl_rule_link *link, *newlink;
1614 	struct uidinfo *newuip;
1615 	struct loginclass *newlc;
1616 	struct prison_racct *newprr;
1617 	LIST_HEAD(, rctl_rule_link) newrules;
1618 
1619 	newuip = newcred->cr_ruidinfo;
1620 	newlc = newcred->cr_loginclass;
1621 	newprr = newcred->cr_prison->pr_prison_racct;
1622 
1623 	LIST_INIT(&newrules);
1624 
1625 again:
1626 	/*
1627 	 * First, count the rules that apply to the process with new
1628 	 * credentials.
1629 	 */
1630 	rulecnt = 0;
1631 	rw_rlock(&rctl_lock);
1632 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1633 		if (link->rrl_rule->rr_subject_type ==
1634 		    RCTL_SUBJECT_TYPE_PROCESS)
1635 			rulecnt++;
1636 	}
1637 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1638 		rulecnt++;
1639 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
1640 		rulecnt++;
1641 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
1642 		rulecnt++;
1643 	rw_runlock(&rctl_lock);
1644 
1645 	/*
1646 	 * Create temporary list.  We've dropped the rctl_lock in order
1647 	 * to use M_WAITOK.
1648 	 */
1649 	for (i = 0; i < rulecnt; i++) {
1650 		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
1651 		newlink->rrl_rule = NULL;
1652 		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
1653 	}
1654 
1655 	newlink = LIST_FIRST(&newrules);
1656 
1657 	/*
1658 	 * Assign rules to the newly allocated list entries.
1659 	 */
1660 	rw_wlock(&rctl_lock);
1661 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1662 		if (link->rrl_rule->rr_subject_type ==
1663 		    RCTL_SUBJECT_TYPE_PROCESS) {
1664 			if (newlink == NULL)
1665 				goto goaround;
1666 			rctl_rule_acquire(link->rrl_rule);
1667 			newlink->rrl_rule = link->rrl_rule;
1668 			newlink = LIST_NEXT(newlink, rrl_next);
1669 			rulecnt--;
1670 		}
1671 	}
1672 
1673 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
1674 		if (newlink == NULL)
1675 			goto goaround;
1676 		rctl_rule_acquire(link->rrl_rule);
1677 		newlink->rrl_rule = link->rrl_rule;
1678 		newlink = LIST_NEXT(newlink, rrl_next);
1679 		rulecnt--;
1680 	}
1681 
1682 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
1683 		if (newlink == NULL)
1684 			goto goaround;
1685 		rctl_rule_acquire(link->rrl_rule);
1686 		newlink->rrl_rule = link->rrl_rule;
1687 		newlink = LIST_NEXT(newlink, rrl_next);
1688 		rulecnt--;
1689 	}
1690 
1691 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
1692 		if (newlink == NULL)
1693 			goto goaround;
1694 		rctl_rule_acquire(link->rrl_rule);
1695 		newlink->rrl_rule = link->rrl_rule;
1696 		newlink = LIST_NEXT(newlink, rrl_next);
1697 		rulecnt--;
1698 	}
1699 
1700 	if (rulecnt == 0) {
1701 		/*
1702 		 * Free the old rule list.
1703 		 */
1704 		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
1705 			link = LIST_FIRST(&p->p_racct->r_rule_links);
1706 			LIST_REMOVE(link, rrl_next);
1707 			rctl_rule_release(link->rrl_rule);
1708 			uma_zfree(rctl_rule_link_zone, link);
1709 		}
1710 
1711 		/*
1712 		 * Replace lists and we're done.
1713 		 *
1714 		 * XXX: Is there any way to switch list heads instead
1715 		 *      of iterating here?
1716 		 */
1717 		while (!LIST_EMPTY(&newrules)) {
1718 			newlink = LIST_FIRST(&newrules);
1719 			LIST_REMOVE(newlink, rrl_next);
1720 			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
1721 			    newlink, rrl_next);
1722 		}
1723 
1724 		rw_wunlock(&rctl_lock);
1725 
1726 		return;
1727 	}
1728 
1729 goaround:
1730 	rw_wunlock(&rctl_lock);
1731 
1732 	/*
1733 	 * Rule list changed while we were not holding the rctl_lock.
1734 	 * Free the new list and try again.
1735 	 */
1736 	while (!LIST_EMPTY(&newrules)) {
1737 		newlink = LIST_FIRST(&newrules);
1738 		LIST_REMOVE(newlink, rrl_next);
1739 		if (newlink->rrl_rule != NULL)
1740 			rctl_rule_release(newlink->rrl_rule);
1741 		uma_zfree(rctl_rule_link_zone, newlink);
1742 	}
1743 
1744 	goto again;
1745 }
1746 
1747 /*
1748  * Assign RCTL rules to the newly created process.
1749  */
1750 int
1751 rctl_proc_fork(struct proc *parent, struct proc *child)
1752 {
1753 	int error;
1754 	struct rctl_rule_link *link;
1755 	struct rctl_rule *rule;
1756 
1757 	LIST_INIT(&child->p_racct->r_rule_links);
1758 
1759 	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
1760 
1761 	rw_wlock(&rctl_lock);
1762 
1763 	/*
1764 	 * Go through limits applicable to the parent and assign them
1765 	 * to the child.  Rules with 'process' subject have to be duplicated
1766 	 * in order to make their rr_subject point to the new process.
1767 	 */
1768 	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
1769 		if (link->rrl_rule->rr_subject_type ==
1770 		    RCTL_SUBJECT_TYPE_PROCESS) {
1771 			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
1772 			if (rule == NULL)
1773 				goto fail;
1774 			KASSERT(rule->rr_subject.rs_proc == parent,
1775 			    ("rule->rr_subject.rs_proc != parent"));
1776 			rule->rr_subject.rs_proc = child;
1777 			error = rctl_racct_add_rule_locked(child->p_racct,
1778 			    rule);
1779 			rctl_rule_release(rule);
1780 			if (error != 0)
1781 				goto fail;
1782 		} else {
1783 			error = rctl_racct_add_rule_locked(child->p_racct,
1784 			    link->rrl_rule);
1785 			if (error != 0)
1786 				goto fail;
1787 		}
1788 	}
1789 
1790 	rw_wunlock(&rctl_lock);
1791 	return (0);
1792 
1793 fail:
1794 	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
1795 		link = LIST_FIRST(&child->p_racct->r_rule_links);
1796 		LIST_REMOVE(link, rrl_next);
1797 		rctl_rule_release(link->rrl_rule);
1798 		uma_zfree(rctl_rule_link_zone, link);
1799 	}
1800 	rw_wunlock(&rctl_lock);
1801 	return (EAGAIN);
1802 }
1803 
1804 /*
1805  * Release rules attached to the racct.
1806  */
1807 void
1808 rctl_racct_release(struct racct *racct)
1809 {
1810 	struct rctl_rule_link *link;
1811 
1812 	rw_wlock(&rctl_lock);
1813 	while (!LIST_EMPTY(&racct->r_rule_links)) {
1814 		link = LIST_FIRST(&racct->r_rule_links);
1815 		LIST_REMOVE(link, rrl_next);
1816 		rctl_rule_release(link->rrl_rule);
1817 		uma_zfree(rctl_rule_link_zone, link);
1818 	}
1819 	rw_wunlock(&rctl_lock);
1820 }
1821 
/*
 * Create the two UMA zones this file allocates from: one for
 * struct rctl_rule_link and one for struct rctl_rule.  Both are
 * created with UMA_ZONE_NOFREE and pointer alignment, without any
 * constructor, destructor, init or fini hooks.
 */
static void
rctl_init(void)
{

	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
1832 
1833 #else /* !RCTL */
1834 
/*
 * Stub used when RCTL is not compiled in: the system call always
 * fails with ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

	return (ENOSYS);
}
1841 
/*
 * Stub used when RCTL is not compiled in: the system call always
 * fails with ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

	return (ENOSYS);
}
1848 
/*
 * Stub used when RCTL is not compiled in: the system call always
 * fails with ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

	return (ENOSYS);
}
1855 
/*
 * Stub used when RCTL is not compiled in: the system call always
 * fails with ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

	return (ENOSYS);
}
1862 
/*
 * Stub used when RCTL is not compiled in: the system call always
 * fails with ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

	return (ENOSYS);
}
1869 
1870 #endif /* !RCTL */
1871