xref: /freebsd/sys/kern/kern_rctl.c (revision 907b59d76938e654f0d040a888e8dfca3de1e222)
1 /*-
2  * Copyright (c) 2010 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <sys/param.h>
36 #include <sys/bus.h>
37 #include <sys/malloc.h>
38 #include <sys/queue.h>
39 #include <sys/refcount.h>
40 #include <sys/jail.h>
41 #include <sys/kernel.h>
42 #include <sys/limits.h>
43 #include <sys/loginclass.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/racct.h>
47 #include <sys/rctl.h>
48 #include <sys/resourcevar.h>
49 #include <sys/sx.h>
50 #include <sys/sysent.h>
51 #include <sys/sysproto.h>
52 #include <sys/systm.h>
53 #include <sys/types.h>
54 #include <sys/eventhandler.h>
55 #include <sys/lock.h>
56 #include <sys/mutex.h>
57 #include <sys/rwlock.h>
58 #include <sys/sbuf.h>
59 #include <sys/taskqueue.h>
60 #include <sys/tree.h>
61 #include <vm/uma.h>
62 
63 #ifdef RCTL
64 #ifndef RACCT
65 #error "The RCTL option requires the RACCT option"
66 #endif
67 
68 FEATURE(rctl, "Resource Limits");
69 
/*
 * HRF_* flag values.  They are not referenced in the part of the file
 * reviewed here.
 */
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer sizes, in bytes.  The products are parenthesized so the macros
 * expand to a single operand; without the parentheses an expression such
 * as "x / RCTL_MAX_INBUFSIZE" or "x % RCTL_MAX_OUTBUFSIZE" would bind to
 * the first factor only.
 *
 * RCTL_MAX_OUTBUFSIZE is the initial value of rctl_maxbufsize;
 * RCTL_LOG_BUFSIZE is the size of the scratch buffer rctl_enforce() uses
 * to format log and devctl messages.
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/*
 * Safety margin subtracted by rctl_pcpu_available() from the remaining
 * %CPU headroom when the limit is more than twice this value.
 */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
79 
/*
 * Run-time tunables, exported below under kern.racct.rctl.
 *
 * rctl_log_rate_limit and rctl_devctl_rate_limit are passed to
 * ppsratecheck() by rctl_enforce() to cap the number of "log" and "devctl"
 * messages per second.
 *
 * NOTE(review): both rate limits are declared as plain int but exported
 * with SYSCTL_UINT; confirm that the signedness mismatch is intended.
 */
static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;

/*
 * Values below are initialized in rctl_init().
 */
static int rctl_throttle_min = -1;
static int rctl_throttle_max = -1;
static int rctl_throttle_pct = -1;
static int rctl_throttle_pct2 = -1;

/*
 * Handlers that validate a new value before storing it into the matching
 * rctl_throttle_* variable.
 */
static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
    &rctl_maxbufsize, 0, "Maximum output buffer size");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
/*
 * The throttle knobs are exported through SYSCTL_PROC so that writes go
 * through the validating handlers; each also has a loader tunable of the
 * same name.
 */
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU",
    "Shortest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU",
    "Longest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU",
    "Throttling penalty for process consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU",
    "Throttling penalty for container consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
120 
/*
 * 'rctl_rule_link' connects a rule with every racct it's related to.
 * For example, rule 'user:X:openfiles:deny=N/process' is linked
 * with uidinfo for user X, and to each process of that user.
 *
 * Links are chained on the racct's r_rule_links list and allocated from
 * rctl_rule_link_zone.
 */
struct rctl_rule_link {
	/* Linkage on the owning racct's r_rule_links list. */
	LIST_ENTRY(rctl_rule_link)	rrl_next;
	/* The rule this link refers to; the link holds a reference on it. */
	struct rctl_rule		*rrl_rule;
	/*
	 * Set to 1 by rctl_enforce() once a log, devctl or signal action
	 * has fired for this link, and reset to 0 when a request fits
	 * within the rule again; prevents repeating the action.
	 */
	int				rrl_exceeded;
};
131 
/*
 * One entry of a name <-> numeric value translation table.  Tables are
 * terminated by an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* textual name, NULL in the terminator */
	int		d_value;	/* numeric constant the name stands for */
};
136 
/*
 * Subject type names and their RCTL_SUBJECT_TYPE_* values; searched by
 * rctl_subject_type_name().
 */
static struct dict subjectnames[] = {
	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
	{ "user", RCTL_SUBJECT_TYPE_USER },
	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
	{ NULL, -1 }};
143 
/*
 * Resource names and their RACCT_* values; searched by
 * rctl_resource_name().
 */
static struct dict resourcenames[] = {
	{ "cputime", RACCT_CPU },
	{ "datasize", RACCT_DATA },
	{ "stacksize", RACCT_STACK },
	{ "coredumpsize", RACCT_CORE },
	{ "memoryuse", RACCT_RSS },
	{ "memorylocked", RACCT_MEMLOCK },
	{ "maxproc", RACCT_NPROC },
	{ "openfiles", RACCT_NOFILE },
	{ "vmemoryuse", RACCT_VMEM },
	{ "pseudoterminals", RACCT_NPTS },
	{ "swapuse", RACCT_SWAP },
	{ "nthr", RACCT_NTHR },
	{ "msgqqueued", RACCT_MSGQQUEUED },
	{ "msgqsize", RACCT_MSGQSIZE },
	{ "nmsgq", RACCT_NMSGQ },
	{ "nsem", RACCT_NSEM },
	{ "nsemop", RACCT_NSEMOP },
	{ "nshm", RACCT_NSHM },
	{ "shmsize", RACCT_SHMSIZE },
	{ "wallclock", RACCT_WALLCLOCK },
	{ "pcpu", RACCT_PCTCPU },
	{ "readbps", RACCT_READBPS },
	{ "writebps", RACCT_WRITEBPS },
	{ "readiops", RACCT_READIOPS },
	{ "writeiops", RACCT_WRITEIOPS },
	{ NULL, -1 }};
171 
/*
 * Action names and their RCTL_ACTION_* values; searched by
 * rctl_action_name().  The "sig*" entries are the signal-delivery
 * actions; the last four are the non-signal actions handled explicitly
 * in rctl_enforce().
 */
static struct dict actionnames[] = {
	{ "sighup", RCTL_ACTION_SIGHUP },
	{ "sigint", RCTL_ACTION_SIGINT },
	{ "sigquit", RCTL_ACTION_SIGQUIT },
	{ "sigill", RCTL_ACTION_SIGILL },
	{ "sigtrap", RCTL_ACTION_SIGTRAP },
	{ "sigabrt", RCTL_ACTION_SIGABRT },
	{ "sigemt", RCTL_ACTION_SIGEMT },
	{ "sigfpe", RCTL_ACTION_SIGFPE },
	{ "sigkill", RCTL_ACTION_SIGKILL },
	{ "sigbus", RCTL_ACTION_SIGBUS },
	{ "sigsegv", RCTL_ACTION_SIGSEGV },
	{ "sigsys", RCTL_ACTION_SIGSYS },
	{ "sigpipe", RCTL_ACTION_SIGPIPE },
	{ "sigalrm", RCTL_ACTION_SIGALRM },
	{ "sigterm", RCTL_ACTION_SIGTERM },
	{ "sigurg", RCTL_ACTION_SIGURG },
	{ "sigstop", RCTL_ACTION_SIGSTOP },
	{ "sigtstp", RCTL_ACTION_SIGTSTP },
	{ "sigchld", RCTL_ACTION_SIGCHLD },
	{ "sigttin", RCTL_ACTION_SIGTTIN },
	{ "sigttou", RCTL_ACTION_SIGTTOU },
	{ "sigio", RCTL_ACTION_SIGIO },
	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
	{ "sigprof", RCTL_ACTION_SIGPROF },
	{ "sigwinch", RCTL_ACTION_SIGWINCH },
	{ "siginfo", RCTL_ACTION_SIGINFO },
	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
	{ "sigthr", RCTL_ACTION_SIGTHR },
	{ "deny", RCTL_ACTION_DENY },
	{ "log", RCTL_ACTION_LOG },
	{ "devctl", RCTL_ACTION_DEVCTL },
	{ "throttle", RCTL_ACTION_THROTTLE },
	{ NULL, -1 }};
209 
/* Subsystem initialization, registered to run at SI_SUB_RACCT. */
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);

/* UMA zones backing struct rctl_rule and struct rctl_rule_link. */
static uma_zone_t rctl_rule_zone;
static uma_zone_t rctl_rule_link_zone;

/* Forward declarations for helpers defined further down in this file. */
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);

/* malloc(9) type used for the temporary message buffers in rctl_enforce(). */
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
220 
221 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
222 {
223 	int error, val = rctl_throttle_min;
224 
225 	error = sysctl_handle_int(oidp, &val, 0, req);
226 	if (error || !req->newptr)
227 		return (error);
228 	if (val < 1 || val > rctl_throttle_max)
229 		return (EINVAL);
230 
231 	RACCT_LOCK();
232 	rctl_throttle_min = val;
233 	RACCT_UNLOCK();
234 
235 	return (0);
236 }
237 
238 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
239 {
240 	int error, val = rctl_throttle_max;
241 
242 	error = sysctl_handle_int(oidp, &val, 0, req);
243 	if (error || !req->newptr)
244 		return (error);
245 	if (val < rctl_throttle_min)
246 		return (EINVAL);
247 
248 	RACCT_LOCK();
249 	rctl_throttle_max = val;
250 	RACCT_UNLOCK();
251 
252 	return (0);
253 }
254 
255 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
256 {
257 	int error, val = rctl_throttle_pct;
258 
259 	error = sysctl_handle_int(oidp, &val, 0, req);
260 	if (error || !req->newptr)
261 		return (error);
262 	if (val < 0)
263 		return (EINVAL);
264 
265 	RACCT_LOCK();
266 	rctl_throttle_pct = val;
267 	RACCT_UNLOCK();
268 
269 	return (0);
270 }
271 
272 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
273 {
274 	int error, val = rctl_throttle_pct2;
275 
276 	error = sysctl_handle_int(oidp, &val, 0, req);
277 	if (error || !req->newptr)
278 		return (error);
279 	if (val < 0)
280 		return (EINVAL);
281 
282 	RACCT_LOCK();
283 	rctl_throttle_pct2 = val;
284 	RACCT_UNLOCK();
285 
286 	return (0);
287 }
288 
289 static const char *
290 rctl_subject_type_name(int subject)
291 {
292 	int i;
293 
294 	for (i = 0; subjectnames[i].d_name != NULL; i++) {
295 		if (subjectnames[i].d_value == subject)
296 			return (subjectnames[i].d_name);
297 	}
298 
299 	panic("rctl_subject_type_name: unknown subject type %d", subject);
300 }
301 
302 static const char *
303 rctl_action_name(int action)
304 {
305 	int i;
306 
307 	for (i = 0; actionnames[i].d_name != NULL; i++) {
308 		if (actionnames[i].d_value == action)
309 			return (actionnames[i].d_name);
310 	}
311 
312 	panic("rctl_action_name: unknown action %d", action);
313 }
314 
315 const char *
316 rctl_resource_name(int resource)
317 {
318 	int i;
319 
320 	for (i = 0; resourcenames[i].d_name != NULL; i++) {
321 		if (resourcenames[i].d_value == resource)
322 			return (resourcenames[i].d_name);
323 	}
324 
325 	panic("rctl_resource_name: unknown resource %d", resource);
326 }
327 
328 static struct racct *
329 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
330 {
331 	struct ucred *cred = p->p_ucred;
332 
333 	ASSERT_RACCT_ENABLED();
334 	RACCT_LOCK_ASSERT();
335 
336 	switch (rule->rr_per) {
337 	case RCTL_SUBJECT_TYPE_PROCESS:
338 		return (p->p_racct);
339 	case RCTL_SUBJECT_TYPE_USER:
340 		return (cred->cr_ruidinfo->ui_racct);
341 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
342 		return (cred->cr_loginclass->lc_racct);
343 	case RCTL_SUBJECT_TYPE_JAIL:
344 		return (cred->cr_prison->pr_prison_racct->prr_racct);
345 	default:
346 		panic("%s: unknown per %d", __func__, rule->rr_per);
347 	}
348 }
349 
350 /*
351  * Return the amount of resource that can be allocated by 'p' before
352  * hitting 'rule'.
353  */
354 static int64_t
355 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
356 {
357 	const struct racct *racct;
358 	int64_t available;
359 
360 	ASSERT_RACCT_ENABLED();
361 	RACCT_LOCK_ASSERT();
362 
363 	racct = rctl_proc_rule_to_racct(p, rule);
364 	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
365 
366 	return (available);
367 }
368 
369 /*
370  * Called every second for proc, uidinfo, loginclass, and jail containers.
371  * If the limit isn't exceeded, it decreases the usage amount to zero.
372  * Otherwise, it decreases it by the value of the limit.  This way
373  * resource consumption exceeding the limit "carries over" to the next
374  * period.
375  */
376 void
377 rctl_throttle_decay(struct racct *racct, int resource)
378 {
379 	struct rctl_rule *rule;
380 	struct rctl_rule_link *link;
381 	int64_t minavailable;
382 
383 	ASSERT_RACCT_ENABLED();
384 	RACCT_LOCK_ASSERT();
385 
386 	minavailable = INT64_MAX;
387 
388 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
389 		rule = link->rrl_rule;
390 
391 		if (rule->rr_resource != resource)
392 			continue;
393 		if (rule->rr_action != RCTL_ACTION_THROTTLE)
394 			continue;
395 
396 		if (rule->rr_amount < minavailable)
397 			minavailable = rule->rr_amount;
398 	}
399 
400 	if (racct->r_resources[resource] < minavailable) {
401 		racct->r_resources[resource] = 0;
402 	} else {
403 		/*
404 		 * Cap utilization counter at ten times the limit.  Otherwise,
405 		 * if we changed the rule lowering the allowed amount, it could
406 		 * take unreasonably long time for the accumulated resource
407 		 * usage to drop.
408 		 */
409 		if (racct->r_resources[resource] > minavailable * 10)
410 			racct->r_resources[resource] = minavailable * 10;
411 
412 		racct->r_resources[resource] -= minavailable;
413 	}
414 }
415 
416 /*
417  * Special version of rctl_get_available() for the %CPU resource.
418  * We slightly cheat here and return less than we normally would.
419  */
420 int64_t
421 rctl_pcpu_available(const struct proc *p) {
422 	struct rctl_rule *rule;
423 	struct rctl_rule_link *link;
424 	int64_t available, minavailable, limit;
425 
426 	ASSERT_RACCT_ENABLED();
427 	RACCT_LOCK_ASSERT();
428 
429 	minavailable = INT64_MAX;
430 	limit = 0;
431 
432 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
433 		rule = link->rrl_rule;
434 		if (rule->rr_resource != RACCT_PCTCPU)
435 			continue;
436 		if (rule->rr_action != RCTL_ACTION_DENY)
437 			continue;
438 		available = rctl_available_resource(p, rule);
439 		if (available < minavailable) {
440 			minavailable = available;
441 			limit = rule->rr_amount;
442 		}
443 	}
444 
445 	/*
446 	 * Return slightly less than actual value of the available
447 	 * %cpu resource.  This makes %cpu throttling more aggressive
448 	 * and lets us act sooner than the limits are already exceeded.
449 	 */
450 	if (limit != 0) {
451 		if (limit > 2 * RCTL_PCPU_SHIFT)
452 			minavailable -= RCTL_PCPU_SHIFT;
453 		else
454 			minavailable -= (limit / 2);
455 	}
456 
457 	return (minavailable);
458 }
459 
/*
 * Saturating unsigned addition: returns a + b, or UINT64_MAX if the sum
 * does not fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

	/* The sum overflows exactly when b exceeds the room left above a. */
	if (b > UINT64_MAX - a)
		return (UINT64_MAX);

	return (a + b);
}
475 
/*
 * Saturating unsigned multiplication: returns a * b, or UINT64_MAX if
 * the product does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	/* A zero factor can never overflow and must not be divided by. */
	if (b == 0)
		return (0);

	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
485 
486 /*
487  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
488  * to what it keeps allocated now.  Returns non-zero if the allocation should
489  * be denied, 0 otherwise.
490  */
491 int
492 rctl_enforce(struct proc *p, int resource, uint64_t amount)
493 {
494 	static struct timeval log_lasttime, devctl_lasttime;
495 	static int log_curtime = 0, devctl_curtime = 0;
496 	struct rctl_rule *rule;
497 	struct rctl_rule_link *link;
498 	struct sbuf sb;
499 	char *buf;
500 	int64_t available;
501 	uint64_t sleep_ms, sleep_ratio;
502 	int should_deny = 0;
503 
504 	ASSERT_RACCT_ENABLED();
505 	RACCT_LOCK_ASSERT();
506 
507 	/*
508 	 * There may be more than one matching rule; go through all of them.
509 	 * Denial should be done last, after logging and sending signals.
510 	 */
511 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
512 		rule = link->rrl_rule;
513 		if (rule->rr_resource != resource)
514 			continue;
515 
516 		available = rctl_available_resource(p, rule);
517 		if (available >= (int64_t)amount) {
518 			link->rrl_exceeded = 0;
519 			continue;
520 		}
521 
522 		switch (rule->rr_action) {
523 		case RCTL_ACTION_DENY:
524 			should_deny = 1;
525 			continue;
526 		case RCTL_ACTION_LOG:
527 			/*
528 			 * If rrl_exceeded != 0, it means we've already
529 			 * logged a warning for this process.
530 			 */
531 			if (link->rrl_exceeded != 0)
532 				continue;
533 
534 			/*
535 			 * If the process state is not fully initialized yet,
536 			 * we can't access most of the required fields, e.g.
537 			 * p->p_comm.  This happens when called from fork1().
538 			 * Ignore this rule for now; it will be processed just
539 			 * after fork, when called from racct_proc_fork_done().
540 			 */
541 			if (p->p_state != PRS_NORMAL)
542 				continue;
543 
544 			if (!ppsratecheck(&log_lasttime, &log_curtime,
545 			    rctl_log_rate_limit))
546 				continue;
547 
548 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
549 			if (buf == NULL) {
550 				printf("rctl_enforce: out of memory\n");
551 				continue;
552 			}
553 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
554 			rctl_rule_to_sbuf(&sb, rule);
555 			sbuf_finish(&sb);
556 			printf("rctl: rule \"%s\" matched by pid %d "
557 			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
558 			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
559 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
560 			sbuf_delete(&sb);
561 			free(buf, M_RCTL);
562 			link->rrl_exceeded = 1;
563 			continue;
564 		case RCTL_ACTION_DEVCTL:
565 			if (link->rrl_exceeded != 0)
566 				continue;
567 
568 			if (p->p_state != PRS_NORMAL)
569 				continue;
570 
571 			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
572 			    rctl_devctl_rate_limit))
573 				continue;
574 
575 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
576 			if (buf == NULL) {
577 				printf("rctl_enforce: out of memory\n");
578 				continue;
579 			}
580 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
581 			sbuf_printf(&sb, "rule=");
582 			rctl_rule_to_sbuf(&sb, rule);
583 			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
584 			    p->p_pid, p->p_ucred->cr_ruid,
585 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
586 			sbuf_finish(&sb);
587 			devctl_notify_f("RCTL", "rule", "matched",
588 			    sbuf_data(&sb), M_NOWAIT);
589 			sbuf_delete(&sb);
590 			free(buf, M_RCTL);
591 			link->rrl_exceeded = 1;
592 			continue;
593 		case RCTL_ACTION_THROTTLE:
594 			if (p->p_state != PRS_NORMAL)
595 				continue;
596 
597 			/*
598 			 * Make the process sleep for a fraction of second
599 			 * proportional to the ratio of process' resource
600 			 * utilization compared to the limit.  The point is
601 			 * to penalize resource hogs: processes that consume
602 			 * more of the available resources sleep for longer.
603 			 *
604 			 * We're trying to defer division until the very end,
605 			 * to minimize the rounding effects.  The following
606 			 * calculation could have been written in a clearer
607 			 * way like this:
608 			 *
609 			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
610 			 *     rule->rr_amount;
611 			 * sleep_ms *= rctl_throttle_pct / 100;
612 			 * if (sleep_ms < rctl_throttle_min)
613 			 *         sleep_ms = rctl_throttle_min;
614 			 *
615 			 */
616 			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
617 			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
618 			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
619 				sleep_ms = rctl_throttle_min * rule->rr_amount;
620 
621 			/*
622 			 * Multiply that by the ratio of the resource
623 			 * consumption for the container compared to the limit,
624 			 * squared.  In other words, a process in a container
625 			 * that is two times over the limit will be throttled
626 			 * four times as much for hitting the same rule.  The
627 			 * point is to penalize processes more if the container
628 			 * itself (eg certain UID or jail) is above the limit.
629 			 */
630 			if (available < 0)
631 				sleep_ratio = -available / rule->rr_amount;
632 			else
633 				sleep_ratio = 0;
634 			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
635 			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
636 			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
637 
638 			/*
639 			 * Finally the division.
640 			 */
641 			sleep_ms /= rule->rr_amount;
642 
643 			if (sleep_ms > rctl_throttle_max)
644 				sleep_ms = rctl_throttle_max;
645 #if 0
646 			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
647 			   __func__, p->p_pid, p->p_comm,
648 			   p->p_racct->r_resources[resource],
649 			   rule->rr_amount, (uintmax_t)sleep_ms,
650 			   (uintmax_t)sleep_ratio, (intmax_t)available);
651 #endif
652 
653 			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
654 			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
655 			racct_proc_throttle(p, sleep_ms);
656 			continue;
657 		default:
658 			if (link->rrl_exceeded != 0)
659 				continue;
660 
661 			if (p->p_state != PRS_NORMAL)
662 				continue;
663 
664 			KASSERT(rule->rr_action > 0 &&
665 			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
666 			    ("rctl_enforce: unknown action %d",
667 			     rule->rr_action));
668 
669 			/*
670 			 * We're using the fact that RCTL_ACTION_SIG* values
671 			 * are equal to their counterparts from sys/signal.h.
672 			 */
673 			kern_psignal(p, rule->rr_action);
674 			link->rrl_exceeded = 1;
675 			continue;
676 		}
677 	}
678 
679 	if (should_deny) {
680 		/*
681 		 * Return fake error code; the caller should change it
682 		 * into one proper for the situation - EFSIZ, ENOMEM etc.
683 		 */
684 		return (EDOOFUS);
685 	}
686 
687 	return (0);
688 }
689 
690 uint64_t
691 rctl_get_limit(struct proc *p, int resource)
692 {
693 	struct rctl_rule *rule;
694 	struct rctl_rule_link *link;
695 	uint64_t amount = UINT64_MAX;
696 
697 	ASSERT_RACCT_ENABLED();
698 	RACCT_LOCK_ASSERT();
699 
700 	/*
701 	 * There may be more than one matching rule; go through all of them.
702 	 * Denial should be done last, after logging and sending signals.
703 	 */
704 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
705 		rule = link->rrl_rule;
706 		if (rule->rr_resource != resource)
707 			continue;
708 		if (rule->rr_action != RCTL_ACTION_DENY)
709 			continue;
710 		if (rule->rr_amount < amount)
711 			amount = rule->rr_amount;
712 	}
713 
714 	return (amount);
715 }
716 
717 uint64_t
718 rctl_get_available(struct proc *p, int resource)
719 {
720 	struct rctl_rule *rule;
721 	struct rctl_rule_link *link;
722 	int64_t available, minavailable, allocated;
723 
724 	minavailable = INT64_MAX;
725 
726 	ASSERT_RACCT_ENABLED();
727 	RACCT_LOCK_ASSERT();
728 
729 	/*
730 	 * There may be more than one matching rule; go through all of them.
731 	 * Denial should be done last, after logging and sending signals.
732 	 */
733 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
734 		rule = link->rrl_rule;
735 		if (rule->rr_resource != resource)
736 			continue;
737 		if (rule->rr_action != RCTL_ACTION_DENY)
738 			continue;
739 		available = rctl_available_resource(p, rule);
740 		if (available < minavailable)
741 			minavailable = available;
742 	}
743 
744 	/*
745 	 * XXX: Think about this _hard_.
746 	 */
747 	allocated = p->p_racct->r_resources[resource];
748 	if (minavailable < INT64_MAX - allocated)
749 		minavailable += allocated;
750 	if (minavailable < 0)
751 		minavailable = 0;
752 
753 	return (minavailable);
754 }
755 
756 static int
757 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
758 {
759 
760 	ASSERT_RACCT_ENABLED();
761 
762 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
763 		if (rule->rr_subject_type != filter->rr_subject_type)
764 			return (0);
765 
766 		switch (filter->rr_subject_type) {
767 		case RCTL_SUBJECT_TYPE_PROCESS:
768 			if (filter->rr_subject.rs_proc != NULL &&
769 			    rule->rr_subject.rs_proc !=
770 			    filter->rr_subject.rs_proc)
771 				return (0);
772 			break;
773 		case RCTL_SUBJECT_TYPE_USER:
774 			if (filter->rr_subject.rs_uip != NULL &&
775 			    rule->rr_subject.rs_uip !=
776 			    filter->rr_subject.rs_uip)
777 				return (0);
778 			break;
779 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
780 			if (filter->rr_subject.rs_loginclass != NULL &&
781 			    rule->rr_subject.rs_loginclass !=
782 			    filter->rr_subject.rs_loginclass)
783 				return (0);
784 			break;
785 		case RCTL_SUBJECT_TYPE_JAIL:
786 			if (filter->rr_subject.rs_prison_racct != NULL &&
787 			    rule->rr_subject.rs_prison_racct !=
788 			    filter->rr_subject.rs_prison_racct)
789 				return (0);
790 			break;
791 		default:
792 			panic("rctl_rule_matches: unknown subject type %d",
793 			    filter->rr_subject_type);
794 		}
795 	}
796 
797 	if (filter->rr_resource != RACCT_UNDEFINED) {
798 		if (rule->rr_resource != filter->rr_resource)
799 			return (0);
800 	}
801 
802 	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
803 		if (rule->rr_action != filter->rr_action)
804 			return (0);
805 	}
806 
807 	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
808 		if (rule->rr_amount != filter->rr_amount)
809 			return (0);
810 	}
811 
812 	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
813 		if (rule->rr_per != filter->rr_per)
814 			return (0);
815 	}
816 
817 	return (1);
818 }
819 
820 static int
821 str2value(const char *str, int *value, struct dict *table)
822 {
823 	int i;
824 
825 	if (value == NULL)
826 		return (EINVAL);
827 
828 	for (i = 0; table[i].d_name != NULL; i++) {
829 		if (strcasecmp(table[i].d_name, str) == 0) {
830 			*value =  table[i].d_value;
831 			return (0);
832 		}
833 	}
834 
835 	return (EINVAL);
836 }
837 
838 static int
839 str2id(const char *str, id_t *value)
840 {
841 	char *end;
842 
843 	if (str == NULL)
844 		return (EINVAL);
845 
846 	*value = strtoul(str, &end, 10);
847 	if ((size_t)(end - str) != strlen(str))
848 		return (EINVAL);
849 
850 	return (0);
851 }
852 
/*
 * Parse the decimal string 'str' into '*value'.  Returns 0 on success,
 * EINVAL if 'str' is NULL or is not consumed entirely by the conversion,
 * and ERANGE if the converted value is negative once stored as int64_t.
 * '*value' is written even when an error is returned for a non-NULL 'str'.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *stop;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &stop, 10);

	/* The whole string must have been consumed. */
	if (*stop != '\0')
		return (EINVAL);

	return (*value < 0 ? ERANGE : 0);
}
870 
871 /*
872  * Connect the rule to the racct, increasing refcount for the rule.
873  */
874 static void
875 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
876 {
877 	struct rctl_rule_link *link;
878 
879 	ASSERT_RACCT_ENABLED();
880 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
881 
882 	rctl_rule_acquire(rule);
883 	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
884 	link->rrl_rule = rule;
885 	link->rrl_exceeded = 0;
886 
887 	RACCT_LOCK();
888 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
889 	RACCT_UNLOCK();
890 }
891 
892 static int
893 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
894 {
895 	struct rctl_rule_link *link;
896 
897 	ASSERT_RACCT_ENABLED();
898 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
899 	RACCT_LOCK_ASSERT();
900 
901 	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
902 	if (link == NULL)
903 		return (ENOMEM);
904 	rctl_rule_acquire(rule);
905 	link->rrl_rule = rule;
906 	link->rrl_exceeded = 0;
907 
908 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
909 
910 	return (0);
911 }
912 
913 /*
914  * Remove limits for a rules matching the filter and release
915  * the refcounts for the rules, possibly freeing them.  Returns
916  * the number of limit structures removed.
917  */
918 static int
919 rctl_racct_remove_rules(struct racct *racct,
920     const struct rctl_rule *filter)
921 {
922 	struct rctl_rule_link *link, *linktmp;
923 	int removed = 0;
924 
925 	ASSERT_RACCT_ENABLED();
926 	RACCT_LOCK_ASSERT();
927 
928 	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
929 		if (!rctl_rule_matches(link->rrl_rule, filter))
930 			continue;
931 
932 		LIST_REMOVE(link, rrl_next);
933 		rctl_rule_release(link->rrl_rule);
934 		uma_zfree(rctl_rule_link_zone, link);
935 		removed++;
936 	}
937 	return (removed);
938 }
939 
940 static void
941 rctl_rule_acquire_subject(struct rctl_rule *rule)
942 {
943 
944 	ASSERT_RACCT_ENABLED();
945 
946 	switch (rule->rr_subject_type) {
947 	case RCTL_SUBJECT_TYPE_UNDEFINED:
948 	case RCTL_SUBJECT_TYPE_PROCESS:
949 		break;
950 	case RCTL_SUBJECT_TYPE_JAIL:
951 		if (rule->rr_subject.rs_prison_racct != NULL)
952 			prison_racct_hold(rule->rr_subject.rs_prison_racct);
953 		break;
954 	case RCTL_SUBJECT_TYPE_USER:
955 		if (rule->rr_subject.rs_uip != NULL)
956 			uihold(rule->rr_subject.rs_uip);
957 		break;
958 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
959 		if (rule->rr_subject.rs_loginclass != NULL)
960 			loginclass_hold(rule->rr_subject.rs_loginclass);
961 		break;
962 	default:
963 		panic("rctl_rule_acquire_subject: unknown subject type %d",
964 		    rule->rr_subject_type);
965 	}
966 }
967 
968 static void
969 rctl_rule_release_subject(struct rctl_rule *rule)
970 {
971 
972 	ASSERT_RACCT_ENABLED();
973 
974 	switch (rule->rr_subject_type) {
975 	case RCTL_SUBJECT_TYPE_UNDEFINED:
976 	case RCTL_SUBJECT_TYPE_PROCESS:
977 		break;
978 	case RCTL_SUBJECT_TYPE_JAIL:
979 		if (rule->rr_subject.rs_prison_racct != NULL)
980 			prison_racct_free(rule->rr_subject.rs_prison_racct);
981 		break;
982 	case RCTL_SUBJECT_TYPE_USER:
983 		if (rule->rr_subject.rs_uip != NULL)
984 			uifree(rule->rr_subject.rs_uip);
985 		break;
986 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
987 		if (rule->rr_subject.rs_loginclass != NULL)
988 			loginclass_free(rule->rr_subject.rs_loginclass);
989 		break;
990 	default:
991 		panic("rctl_rule_release_subject: unknown subject type %d",
992 		    rule->rr_subject_type);
993 	}
994 }
995 
996 struct rctl_rule *
997 rctl_rule_alloc(int flags)
998 {
999 	struct rctl_rule *rule;
1000 
1001 	ASSERT_RACCT_ENABLED();
1002 
1003 	rule = uma_zalloc(rctl_rule_zone, flags);
1004 	if (rule == NULL)
1005 		return (NULL);
1006 	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1007 	rule->rr_subject.rs_proc = NULL;
1008 	rule->rr_subject.rs_uip = NULL;
1009 	rule->rr_subject.rs_loginclass = NULL;
1010 	rule->rr_subject.rs_prison_racct = NULL;
1011 	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1012 	rule->rr_resource = RACCT_UNDEFINED;
1013 	rule->rr_action = RCTL_ACTION_UNDEFINED;
1014 	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1015 	refcount_init(&rule->rr_refcount, 1);
1016 
1017 	return (rule);
1018 }
1019 
1020 struct rctl_rule *
1021 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1022 {
1023 	struct rctl_rule *copy;
1024 
1025 	ASSERT_RACCT_ENABLED();
1026 
1027 	copy = uma_zalloc(rctl_rule_zone, flags);
1028 	if (copy == NULL)
1029 		return (NULL);
1030 	copy->rr_subject_type = rule->rr_subject_type;
1031 	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1032 	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1033 	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1034 	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1035 	copy->rr_per = rule->rr_per;
1036 	copy->rr_resource = rule->rr_resource;
1037 	copy->rr_action = rule->rr_action;
1038 	copy->rr_amount = rule->rr_amount;
1039 	refcount_init(&copy->rr_refcount, 1);
1040 	rctl_rule_acquire_subject(copy);
1041 
1042 	return (copy);
1043 }
1044 
1045 void
1046 rctl_rule_acquire(struct rctl_rule *rule)
1047 {
1048 
1049 	ASSERT_RACCT_ENABLED();
1050 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1051 
1052 	refcount_acquire(&rule->rr_refcount);
1053 }
1054 
1055 static void
1056 rctl_rule_free(void *context, int pending)
1057 {
1058 	struct rctl_rule *rule;
1059 
1060 	rule = (struct rctl_rule *)context;
1061 
1062 	ASSERT_RACCT_ENABLED();
1063 	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1064 
1065 	/*
1066 	 * We don't need locking here; rule is guaranteed to be inaccessible.
1067 	 */
1068 
1069 	rctl_rule_release_subject(rule);
1070 	uma_zfree(rctl_rule_zone, rule);
1071 }
1072 
1073 void
1074 rctl_rule_release(struct rctl_rule *rule)
1075 {
1076 
1077 	ASSERT_RACCT_ENABLED();
1078 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1079 
1080 	if (refcount_release(&rule->rr_refcount)) {
1081 		/*
1082 		 * rctl_rule_release() is often called when iterating
1083 		 * over all the uidinfo structures in the system,
1084 		 * holding uihashtbl_lock.  Since rctl_rule_free()
1085 		 * might end up calling uifree(), this would lead
1086 		 * to lock recursion.  Use taskqueue to avoid this.
1087 		 */
1088 		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1089 		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1090 	}
1091 }
1092 
1093 static int
1094 rctl_rule_fully_specified(const struct rctl_rule *rule)
1095 {
1096 
1097 	ASSERT_RACCT_ENABLED();
1098 
1099 	switch (rule->rr_subject_type) {
1100 	case RCTL_SUBJECT_TYPE_UNDEFINED:
1101 		return (0);
1102 	case RCTL_SUBJECT_TYPE_PROCESS:
1103 		if (rule->rr_subject.rs_proc == NULL)
1104 			return (0);
1105 		break;
1106 	case RCTL_SUBJECT_TYPE_USER:
1107 		if (rule->rr_subject.rs_uip == NULL)
1108 			return (0);
1109 		break;
1110 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1111 		if (rule->rr_subject.rs_loginclass == NULL)
1112 			return (0);
1113 		break;
1114 	case RCTL_SUBJECT_TYPE_JAIL:
1115 		if (rule->rr_subject.rs_prison_racct == NULL)
1116 			return (0);
1117 		break;
1118 	default:
1119 		panic("rctl_rule_fully_specified: unknown subject type %d",
1120 		    rule->rr_subject_type);
1121 	}
1122 	if (rule->rr_resource == RACCT_UNDEFINED)
1123 		return (0);
1124 	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1125 		return (0);
1126 	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1127 		return (0);
1128 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1129 		return (0);
1130 
1131 	return (1);
1132 }
1133 
1134 static int
1135 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1136 {
1137 	struct rctl_rule *rule;
1138 	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1139 	     *amountstr, *perstr;
1140 	id_t id;
1141 	int error = 0;
1142 
1143 	ASSERT_RACCT_ENABLED();
1144 
1145 	rule = rctl_rule_alloc(M_WAITOK);
1146 
1147 	subjectstr = strsep(&rulestr, ":");
1148 	subject_idstr = strsep(&rulestr, ":");
1149 	resourcestr = strsep(&rulestr, ":");
1150 	actionstr = strsep(&rulestr, "=/");
1151 	amountstr = strsep(&rulestr, "/");
1152 	perstr = rulestr;
1153 
1154 	if (subjectstr == NULL || subjectstr[0] == '\0')
1155 		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1156 	else {
1157 		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1158 		if (error != 0)
1159 			goto out;
1160 	}
1161 
1162 	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1163 		rule->rr_subject.rs_proc = NULL;
1164 		rule->rr_subject.rs_uip = NULL;
1165 		rule->rr_subject.rs_loginclass = NULL;
1166 		rule->rr_subject.rs_prison_racct = NULL;
1167 	} else {
1168 		switch (rule->rr_subject_type) {
1169 		case RCTL_SUBJECT_TYPE_UNDEFINED:
1170 			error = EINVAL;
1171 			goto out;
1172 		case RCTL_SUBJECT_TYPE_PROCESS:
1173 			error = str2id(subject_idstr, &id);
1174 			if (error != 0)
1175 				goto out;
1176 			sx_assert(&allproc_lock, SA_LOCKED);
1177 			rule->rr_subject.rs_proc = pfind(id);
1178 			if (rule->rr_subject.rs_proc == NULL) {
1179 				error = ESRCH;
1180 				goto out;
1181 			}
1182 			PROC_UNLOCK(rule->rr_subject.rs_proc);
1183 			break;
1184 		case RCTL_SUBJECT_TYPE_USER:
1185 			error = str2id(subject_idstr, &id);
1186 			if (error != 0)
1187 				goto out;
1188 			rule->rr_subject.rs_uip = uifind(id);
1189 			break;
1190 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1191 			rule->rr_subject.rs_loginclass =
1192 			    loginclass_find(subject_idstr);
1193 			if (rule->rr_subject.rs_loginclass == NULL) {
1194 				error = ENAMETOOLONG;
1195 				goto out;
1196 			}
1197 			break;
1198 		case RCTL_SUBJECT_TYPE_JAIL:
1199 			rule->rr_subject.rs_prison_racct =
1200 			    prison_racct_find(subject_idstr);
1201 			if (rule->rr_subject.rs_prison_racct == NULL) {
1202 				error = ENAMETOOLONG;
1203 				goto out;
1204 			}
1205 			break;
1206                default:
1207                        panic("rctl_string_to_rule: unknown subject type %d",
1208                            rule->rr_subject_type);
1209                }
1210 	}
1211 
1212 	if (resourcestr == NULL || resourcestr[0] == '\0')
1213 		rule->rr_resource = RACCT_UNDEFINED;
1214 	else {
1215 		error = str2value(resourcestr, &rule->rr_resource,
1216 		    resourcenames);
1217 		if (error != 0)
1218 			goto out;
1219 	}
1220 
1221 	if (actionstr == NULL || actionstr[0] == '\0')
1222 		rule->rr_action = RCTL_ACTION_UNDEFINED;
1223 	else {
1224 		error = str2value(actionstr, &rule->rr_action, actionnames);
1225 		if (error != 0)
1226 			goto out;
1227 	}
1228 
1229 	if (amountstr == NULL || amountstr[0] == '\0')
1230 		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1231 	else {
1232 		error = str2int64(amountstr, &rule->rr_amount);
1233 		if (error != 0)
1234 			goto out;
1235 		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1236 			if (rule->rr_amount > INT64_MAX / 1000000) {
1237 				error = ERANGE;
1238 				goto out;
1239 			}
1240 			rule->rr_amount *= 1000000;
1241 		}
1242 	}
1243 
1244 	if (perstr == NULL || perstr[0] == '\0')
1245 		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1246 	else {
1247 		error = str2value(perstr, &rule->rr_per, subjectnames);
1248 		if (error != 0)
1249 			goto out;
1250 	}
1251 
1252 out:
1253 	if (error == 0)
1254 		*rulep = rule;
1255 	else
1256 		rctl_rule_release(rule);
1257 
1258 	return (error);
1259 }
1260 
1261 /*
1262  * Link a rule with all the subjects it applies to.
1263  */
1264 int
1265 rctl_rule_add(struct rctl_rule *rule)
1266 {
1267 	struct proc *p;
1268 	struct ucred *cred;
1269 	struct uidinfo *uip;
1270 	struct prison *pr;
1271 	struct prison_racct *prr;
1272 	struct loginclass *lc;
1273 	struct rctl_rule *rule2;
1274 	int match;
1275 
1276 	ASSERT_RACCT_ENABLED();
1277 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1278 
1279 	/*
1280 	 * Some rules just don't make sense, like "deny" rule for an undeniable
1281 	 * resource.  The exception are the RSS and %CPU resources - they are
1282 	 * not deniable in the racct sense, but the limit is enforced in
1283 	 * a different way.
1284 	 */
1285 	if (rule->rr_action == RCTL_ACTION_DENY &&
1286 	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
1287 	    rule->rr_resource != RACCT_RSS &&
1288 	    rule->rr_resource != RACCT_PCTCPU) {
1289 		return (EOPNOTSUPP);
1290 	}
1291 
1292 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1293 	    !RACCT_IS_DECAYING(rule->rr_resource)) {
1294 		return (EOPNOTSUPP);
1295 	}
1296 
1297 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1298 	    rule->rr_resource == RACCT_PCTCPU) {
1299 		return (EOPNOTSUPP);
1300 	}
1301 
1302 	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1303 	    RACCT_IS_SLOPPY(rule->rr_resource)) {
1304 		return (EOPNOTSUPP);
1305 	}
1306 
1307 	/*
1308 	 * Make sure there are no duplicated rules.  Also, for the "deny"
1309 	 * rules, remove ones differing only by "amount".
1310 	 */
1311 	if (rule->rr_action == RCTL_ACTION_DENY) {
1312 		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1313 		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1314 		rctl_rule_remove(rule2);
1315 		rctl_rule_release(rule2);
1316 	} else
1317 		rctl_rule_remove(rule);
1318 
1319 	switch (rule->rr_subject_type) {
1320 	case RCTL_SUBJECT_TYPE_PROCESS:
1321 		p = rule->rr_subject.rs_proc;
1322 		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1323 
1324 		rctl_racct_add_rule(p->p_racct, rule);
1325 		/*
1326 		 * In case of per-process rule, we don't have anything more
1327 		 * to do.
1328 		 */
1329 		return (0);
1330 
1331 	case RCTL_SUBJECT_TYPE_USER:
1332 		uip = rule->rr_subject.rs_uip;
1333 		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1334 		rctl_racct_add_rule(uip->ui_racct, rule);
1335 		break;
1336 
1337 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1338 		lc = rule->rr_subject.rs_loginclass;
1339 		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1340 		rctl_racct_add_rule(lc->lc_racct, rule);
1341 		break;
1342 
1343 	case RCTL_SUBJECT_TYPE_JAIL:
1344 		prr = rule->rr_subject.rs_prison_racct;
1345 		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1346 		rctl_racct_add_rule(prr->prr_racct, rule);
1347 		break;
1348 
1349 	default:
1350 		panic("rctl_rule_add: unknown subject type %d",
1351 		    rule->rr_subject_type);
1352 	}
1353 
1354 	/*
1355 	 * Now go through all the processes and add the new rule to the ones
1356 	 * it applies to.
1357 	 */
1358 	sx_assert(&allproc_lock, SA_LOCKED);
1359 	FOREACH_PROC_IN_SYSTEM(p) {
1360 		cred = p->p_ucred;
1361 		switch (rule->rr_subject_type) {
1362 		case RCTL_SUBJECT_TYPE_USER:
1363 			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1364 			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1365 				break;
1366 			continue;
1367 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1368 			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1369 				break;
1370 			continue;
1371 		case RCTL_SUBJECT_TYPE_JAIL:
1372 			match = 0;
1373 			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1374 				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1375 					match = 1;
1376 					break;
1377 				}
1378 			}
1379 			if (match)
1380 				break;
1381 			continue;
1382 		default:
1383 			panic("rctl_rule_add: unknown subject type %d",
1384 			    rule->rr_subject_type);
1385 		}
1386 
1387 		rctl_racct_add_rule(p->p_racct, rule);
1388 	}
1389 
1390 	return (0);
1391 }
1392 
/*
 * "Pre" hook handed to the *_racct_foreach() iterators in this file:
 * takes the racct lock.
 */
static void
rctl_rule_pre_callback(void)
{

	RACCT_LOCK();
}
1399 
/*
 * "Post" hook handed to the *_racct_foreach() iterators in this file:
 * drops the racct lock.
 */
static void
rctl_rule_post_callback(void)
{

	RACCT_UNLOCK();
}
1406 
/*
 * Iterator callback used by rctl_rule_remove(): remove from 'racct'
 * the rules matching the filter passed in 'arg2' and add the value
 * returned by rctl_racct_remove_rules() to the int counter that 'arg3'
 * points at.  Must be called with the racct lock held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = (struct rctl_rule *)arg2;
	int *totalp = (int *)arg3;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	*totalp += rctl_racct_remove_rules(racct, filter);
}
1420 
1421 /*
1422  * Remove all rules that match the filter.
1423  */
1424 int
1425 rctl_rule_remove(struct rctl_rule *filter)
1426 {
1427 	struct proc *p;
1428 	int found = 0;
1429 
1430 	ASSERT_RACCT_ENABLED();
1431 
1432 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1433 	    filter->rr_subject.rs_proc != NULL) {
1434 		p = filter->rr_subject.rs_proc;
1435 		RACCT_LOCK();
1436 		found = rctl_racct_remove_rules(p->p_racct, filter);
1437 		RACCT_UNLOCK();
1438 		if (found)
1439 			return (0);
1440 		return (ESRCH);
1441 	}
1442 
1443 	loginclass_racct_foreach(rctl_rule_remove_callback,
1444 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1445 	    filter, (void *)&found);
1446 	ui_racct_foreach(rctl_rule_remove_callback,
1447 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1448 	    filter, (void *)&found);
1449 	prison_racct_foreach(rctl_rule_remove_callback,
1450 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1451 	    filter, (void *)&found);
1452 
1453 	sx_assert(&allproc_lock, SA_LOCKED);
1454 	RACCT_LOCK();
1455 	FOREACH_PROC_IN_SYSTEM(p) {
1456 		found += rctl_racct_remove_rules(p->p_racct, filter);
1457 	}
1458 	RACCT_UNLOCK();
1459 
1460 	if (found)
1461 		return (0);
1462 	return (ESRCH);
1463 }
1464 
1465 /*
1466  * Appends a rule to the sbuf.
1467  */
1468 static void
1469 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1470 {
1471 	int64_t amount;
1472 
1473 	ASSERT_RACCT_ENABLED();
1474 
1475 	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1476 
1477 	switch (rule->rr_subject_type) {
1478 	case RCTL_SUBJECT_TYPE_PROCESS:
1479 		if (rule->rr_subject.rs_proc == NULL)
1480 			sbuf_printf(sb, ":");
1481 		else
1482 			sbuf_printf(sb, "%d:",
1483 			    rule->rr_subject.rs_proc->p_pid);
1484 		break;
1485 	case RCTL_SUBJECT_TYPE_USER:
1486 		if (rule->rr_subject.rs_uip == NULL)
1487 			sbuf_printf(sb, ":");
1488 		else
1489 			sbuf_printf(sb, "%d:",
1490 			    rule->rr_subject.rs_uip->ui_uid);
1491 		break;
1492 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1493 		if (rule->rr_subject.rs_loginclass == NULL)
1494 			sbuf_printf(sb, ":");
1495 		else
1496 			sbuf_printf(sb, "%s:",
1497 			    rule->rr_subject.rs_loginclass->lc_name);
1498 		break;
1499 	case RCTL_SUBJECT_TYPE_JAIL:
1500 		if (rule->rr_subject.rs_prison_racct == NULL)
1501 			sbuf_printf(sb, ":");
1502 		else
1503 			sbuf_printf(sb, "%s:",
1504 			    rule->rr_subject.rs_prison_racct->prr_name);
1505 		break;
1506 	default:
1507 		panic("rctl_rule_to_sbuf: unknown subject type %d",
1508 		    rule->rr_subject_type);
1509 	}
1510 
1511 	amount = rule->rr_amount;
1512 	if (amount != RCTL_AMOUNT_UNDEFINED &&
1513 	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1514 		amount /= 1000000;
1515 
1516 	sbuf_printf(sb, "%s:%s=%jd",
1517 	    rctl_resource_name(rule->rr_resource),
1518 	    rctl_action_name(rule->rr_action),
1519 	    amount);
1520 
1521 	if (rule->rr_per != rule->rr_subject_type)
1522 		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1523 }
1524 
1525 /*
1526  * Routine used by RCTL syscalls to read in input string.
1527  */
1528 static int
1529 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1530 {
1531 	char *str;
1532 	int error;
1533 
1534 	ASSERT_RACCT_ENABLED();
1535 
1536 	if (inbuflen <= 0)
1537 		return (EINVAL);
1538 	if (inbuflen > RCTL_MAX_INBUFSIZE)
1539 		return (E2BIG);
1540 
1541 	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1542 	error = copyinstr(inbufp, str, inbuflen, NULL);
1543 	if (error != 0) {
1544 		free(str, M_RCTL);
1545 		return (error);
1546 	}
1547 
1548 	*inputstr = str;
1549 
1550 	return (0);
1551 }
1552 
1553 /*
1554  * Routine used by RCTL syscalls to write out output string.
1555  */
1556 static int
1557 rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1558 {
1559 	int error;
1560 
1561 	ASSERT_RACCT_ENABLED();
1562 
1563 	if (outputsbuf == NULL)
1564 		return (0);
1565 
1566 	sbuf_finish(outputsbuf);
1567 	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1568 		sbuf_delete(outputsbuf);
1569 		return (ERANGE);
1570 	}
1571 	error = copyout(sbuf_data(outputsbuf), outbufp,
1572 	    sbuf_len(outputsbuf) + 1);
1573 	sbuf_delete(outputsbuf);
1574 	return (error);
1575 }
1576 
1577 static struct sbuf *
1578 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1579 {
1580 	struct sbuf *sb;
1581 	int64_t amount;
1582 	int i;
1583 
1584 	ASSERT_RACCT_ENABLED();
1585 
1586 	sb = sbuf_new_auto();
1587 	for (i = 0; i <= RACCT_MAX; i++) {
1588 		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1589 			continue;
1590 		RACCT_LOCK();
1591 		amount = racct->r_resources[i];
1592 		RACCT_UNLOCK();
1593 		if (RACCT_IS_IN_MILLIONS(i))
1594 			amount /= 1000000;
1595 		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1596 	}
1597 	sbuf_setpos(sb, sbuf_len(sb) - 1);
1598 	return (sb);
1599 }
1600 
1601 int
1602 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1603 {
1604 	struct rctl_rule *filter;
1605 	struct sbuf *outputsbuf = NULL;
1606 	struct proc *p;
1607 	struct uidinfo *uip;
1608 	struct loginclass *lc;
1609 	struct prison_racct *prr;
1610 	char *inputstr;
1611 	int error;
1612 
1613 	if (!racct_enable)
1614 		return (ENOSYS);
1615 
1616 	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1617 	if (error != 0)
1618 		return (error);
1619 
1620 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1621 	if (error != 0)
1622 		return (error);
1623 
1624 	sx_slock(&allproc_lock);
1625 	error = rctl_string_to_rule(inputstr, &filter);
1626 	free(inputstr, M_RCTL);
1627 	if (error != 0) {
1628 		sx_sunlock(&allproc_lock);
1629 		return (error);
1630 	}
1631 
1632 	switch (filter->rr_subject_type) {
1633 	case RCTL_SUBJECT_TYPE_PROCESS:
1634 		p = filter->rr_subject.rs_proc;
1635 		if (p == NULL) {
1636 			error = EINVAL;
1637 			goto out;
1638 		}
1639 		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1640 		break;
1641 	case RCTL_SUBJECT_TYPE_USER:
1642 		uip = filter->rr_subject.rs_uip;
1643 		if (uip == NULL) {
1644 			error = EINVAL;
1645 			goto out;
1646 		}
1647 		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1648 		break;
1649 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1650 		lc = filter->rr_subject.rs_loginclass;
1651 		if (lc == NULL) {
1652 			error = EINVAL;
1653 			goto out;
1654 		}
1655 		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1656 		break;
1657 	case RCTL_SUBJECT_TYPE_JAIL:
1658 		prr = filter->rr_subject.rs_prison_racct;
1659 		if (prr == NULL) {
1660 			error = EINVAL;
1661 			goto out;
1662 		}
1663 		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1664 		break;
1665 	default:
1666 		error = EINVAL;
1667 	}
1668 out:
1669 	rctl_rule_release(filter);
1670 	sx_sunlock(&allproc_lock);
1671 	if (error != 0)
1672 		return (error);
1673 
1674 	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1675 
1676 	return (error);
1677 }
1678 
1679 static void
1680 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1681 {
1682 	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1683 	struct rctl_rule_link *link;
1684 	struct sbuf *sb = (struct sbuf *)arg3;
1685 
1686 	ASSERT_RACCT_ENABLED();
1687 	RACCT_LOCK_ASSERT();
1688 
1689 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1690 		if (!rctl_rule_matches(link->rrl_rule, filter))
1691 			continue;
1692 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1693 		sbuf_printf(sb, ",");
1694 	}
1695 }
1696 
1697 int
1698 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1699 {
1700 	struct sbuf *sb;
1701 	struct rctl_rule *filter;
1702 	struct rctl_rule_link *link;
1703 	struct proc *p;
1704 	char *inputstr, *buf;
1705 	size_t bufsize;
1706 	int error;
1707 
1708 	if (!racct_enable)
1709 		return (ENOSYS);
1710 
1711 	error = priv_check(td, PRIV_RCTL_GET_RULES);
1712 	if (error != 0)
1713 		return (error);
1714 
1715 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1716 	if (error != 0)
1717 		return (error);
1718 
1719 	sx_slock(&allproc_lock);
1720 	error = rctl_string_to_rule(inputstr, &filter);
1721 	free(inputstr, M_RCTL);
1722 	if (error != 0) {
1723 		sx_sunlock(&allproc_lock);
1724 		return (error);
1725 	}
1726 
1727 	bufsize = uap->outbuflen;
1728 	if (bufsize > rctl_maxbufsize) {
1729 		sx_sunlock(&allproc_lock);
1730 		return (E2BIG);
1731 	}
1732 
1733 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1734 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1735 	KASSERT(sb != NULL, ("sbuf_new failed"));
1736 
1737 	FOREACH_PROC_IN_SYSTEM(p) {
1738 		RACCT_LOCK();
1739 		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1740 			/*
1741 			 * Non-process rules will be added to the buffer later.
1742 			 * Adding them here would result in duplicated output.
1743 			 */
1744 			if (link->rrl_rule->rr_subject_type !=
1745 			    RCTL_SUBJECT_TYPE_PROCESS)
1746 				continue;
1747 			if (!rctl_rule_matches(link->rrl_rule, filter))
1748 				continue;
1749 			rctl_rule_to_sbuf(sb, link->rrl_rule);
1750 			sbuf_printf(sb, ",");
1751 		}
1752 		RACCT_UNLOCK();
1753 	}
1754 
1755 	loginclass_racct_foreach(rctl_get_rules_callback,
1756 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1757 	    filter, sb);
1758 	ui_racct_foreach(rctl_get_rules_callback,
1759 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1760 	    filter, sb);
1761 	prison_racct_foreach(rctl_get_rules_callback,
1762 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1763 	    filter, sb);
1764 	if (sbuf_error(sb) == ENOMEM) {
1765 		error = ERANGE;
1766 		goto out;
1767 	}
1768 
1769 	/*
1770 	 * Remove trailing ",".
1771 	 */
1772 	if (sbuf_len(sb) > 0)
1773 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1774 
1775 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1776 out:
1777 	rctl_rule_release(filter);
1778 	sx_sunlock(&allproc_lock);
1779 	free(buf, M_RCTL);
1780 	return (error);
1781 }
1782 
1783 int
1784 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1785 {
1786 	struct sbuf *sb;
1787 	struct rctl_rule *filter;
1788 	struct rctl_rule_link *link;
1789 	char *inputstr, *buf;
1790 	size_t bufsize;
1791 	int error;
1792 
1793 	if (!racct_enable)
1794 		return (ENOSYS);
1795 
1796 	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1797 	if (error != 0)
1798 		return (error);
1799 
1800 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1801 	if (error != 0)
1802 		return (error);
1803 
1804 	sx_slock(&allproc_lock);
1805 	error = rctl_string_to_rule(inputstr, &filter);
1806 	free(inputstr, M_RCTL);
1807 	if (error != 0) {
1808 		sx_sunlock(&allproc_lock);
1809 		return (error);
1810 	}
1811 
1812 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1813 		rctl_rule_release(filter);
1814 		sx_sunlock(&allproc_lock);
1815 		return (EINVAL);
1816 	}
1817 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1818 		rctl_rule_release(filter);
1819 		sx_sunlock(&allproc_lock);
1820 		return (EOPNOTSUPP);
1821 	}
1822 	if (filter->rr_subject.rs_proc == NULL) {
1823 		rctl_rule_release(filter);
1824 		sx_sunlock(&allproc_lock);
1825 		return (EINVAL);
1826 	}
1827 
1828 	bufsize = uap->outbuflen;
1829 	if (bufsize > rctl_maxbufsize) {
1830 		rctl_rule_release(filter);
1831 		sx_sunlock(&allproc_lock);
1832 		return (E2BIG);
1833 	}
1834 
1835 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1836 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1837 	KASSERT(sb != NULL, ("sbuf_new failed"));
1838 
1839 	RACCT_LOCK();
1840 	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1841 	    rrl_next) {
1842 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1843 		sbuf_printf(sb, ",");
1844 	}
1845 	RACCT_UNLOCK();
1846 	if (sbuf_error(sb) == ENOMEM) {
1847 		error = ERANGE;
1848 		sbuf_delete(sb);
1849 		goto out;
1850 	}
1851 
1852 	/*
1853 	 * Remove trailing ",".
1854 	 */
1855 	if (sbuf_len(sb) > 0)
1856 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1857 
1858 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1859 out:
1860 	rctl_rule_release(filter);
1861 	sx_sunlock(&allproc_lock);
1862 	free(buf, M_RCTL);
1863 	return (error);
1864 }
1865 
1866 int
1867 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1868 {
1869 	struct rctl_rule *rule;
1870 	char *inputstr;
1871 	int error;
1872 
1873 	if (!racct_enable)
1874 		return (ENOSYS);
1875 
1876 	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1877 	if (error != 0)
1878 		return (error);
1879 
1880 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1881 	if (error != 0)
1882 		return (error);
1883 
1884 	sx_slock(&allproc_lock);
1885 	error = rctl_string_to_rule(inputstr, &rule);
1886 	free(inputstr, M_RCTL);
1887 	if (error != 0) {
1888 		sx_sunlock(&allproc_lock);
1889 		return (error);
1890 	}
1891 	/*
1892 	 * The 'per' part of a rule is optional.
1893 	 */
1894 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1895 	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1896 		rule->rr_per = rule->rr_subject_type;
1897 
1898 	if (!rctl_rule_fully_specified(rule)) {
1899 		error = EINVAL;
1900 		goto out;
1901 	}
1902 
1903 	error = rctl_rule_add(rule);
1904 
1905 out:
1906 	rctl_rule_release(rule);
1907 	sx_sunlock(&allproc_lock);
1908 	return (error);
1909 }
1910 
1911 int
1912 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1913 {
1914 	struct rctl_rule *filter;
1915 	char *inputstr;
1916 	int error;
1917 
1918 	if (!racct_enable)
1919 		return (ENOSYS);
1920 
1921 	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1922 	if (error != 0)
1923 		return (error);
1924 
1925 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1926 	if (error != 0)
1927 		return (error);
1928 
1929 	sx_slock(&allproc_lock);
1930 	error = rctl_string_to_rule(inputstr, &filter);
1931 	free(inputstr, M_RCTL);
1932 	if (error != 0) {
1933 		sx_sunlock(&allproc_lock);
1934 		return (error);
1935 	}
1936 
1937 	error = rctl_rule_remove(filter);
1938 	rctl_rule_release(filter);
1939 	sx_sunlock(&allproc_lock);
1940 
1941 	return (error);
1942 }
1943 
1944 /*
1945  * Update RCTL rule list after credential change.
1946  */
1947 void
1948 rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1949 {
1950 	LIST_HEAD(, rctl_rule_link) newrules;
1951 	struct rctl_rule_link *link, *newlink;
1952 	struct uidinfo *newuip;
1953 	struct loginclass *newlc;
1954 	struct prison_racct *newprr;
1955 	int rulecnt, i;
1956 
1957 	ASSERT_RACCT_ENABLED();
1958 
1959 	newuip = newcred->cr_ruidinfo;
1960 	newlc = newcred->cr_loginclass;
1961 	newprr = newcred->cr_prison->pr_prison_racct;
1962 
1963 	LIST_INIT(&newrules);
1964 
1965 again:
1966 	/*
1967 	 * First, count the rules that apply to the process with new
1968 	 * credentials.
1969 	 */
1970 	rulecnt = 0;
1971 	RACCT_LOCK();
1972 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1973 		if (link->rrl_rule->rr_subject_type ==
1974 		    RCTL_SUBJECT_TYPE_PROCESS)
1975 			rulecnt++;
1976 	}
1977 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1978 		rulecnt++;
1979 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
1980 		rulecnt++;
1981 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
1982 		rulecnt++;
1983 	RACCT_UNLOCK();
1984 
1985 	/*
1986 	 * Create temporary list.  We've dropped the rctl_lock in order
1987 	 * to use M_WAITOK.
1988 	 */
1989 	for (i = 0; i < rulecnt; i++) {
1990 		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
1991 		newlink->rrl_rule = NULL;
1992 		newlink->rrl_exceeded = 0;
1993 		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
1994 	}
1995 
1996 	newlink = LIST_FIRST(&newrules);
1997 
1998 	/*
1999 	 * Assign rules to the newly allocated list entries.
2000 	 */
2001 	RACCT_LOCK();
2002 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2003 		if (link->rrl_rule->rr_subject_type ==
2004 		    RCTL_SUBJECT_TYPE_PROCESS) {
2005 			if (newlink == NULL)
2006 				goto goaround;
2007 			rctl_rule_acquire(link->rrl_rule);
2008 			newlink->rrl_rule = link->rrl_rule;
2009 			newlink->rrl_exceeded = link->rrl_exceeded;
2010 			newlink = LIST_NEXT(newlink, rrl_next);
2011 			rulecnt--;
2012 		}
2013 	}
2014 
2015 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2016 		if (newlink == NULL)
2017 			goto goaround;
2018 		rctl_rule_acquire(link->rrl_rule);
2019 		newlink->rrl_rule = link->rrl_rule;
2020 		newlink->rrl_exceeded = link->rrl_exceeded;
2021 		newlink = LIST_NEXT(newlink, rrl_next);
2022 		rulecnt--;
2023 	}
2024 
2025 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2026 		if (newlink == NULL)
2027 			goto goaround;
2028 		rctl_rule_acquire(link->rrl_rule);
2029 		newlink->rrl_rule = link->rrl_rule;
2030 		newlink->rrl_exceeded = link->rrl_exceeded;
2031 		newlink = LIST_NEXT(newlink, rrl_next);
2032 		rulecnt--;
2033 	}
2034 
2035 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2036 		if (newlink == NULL)
2037 			goto goaround;
2038 		rctl_rule_acquire(link->rrl_rule);
2039 		newlink->rrl_rule = link->rrl_rule;
2040 		newlink->rrl_exceeded = link->rrl_exceeded;
2041 		newlink = LIST_NEXT(newlink, rrl_next);
2042 		rulecnt--;
2043 	}
2044 
2045 	if (rulecnt == 0) {
2046 		/*
2047 		 * Free the old rule list.
2048 		 */
2049 		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2050 			link = LIST_FIRST(&p->p_racct->r_rule_links);
2051 			LIST_REMOVE(link, rrl_next);
2052 			rctl_rule_release(link->rrl_rule);
2053 			uma_zfree(rctl_rule_link_zone, link);
2054 		}
2055 
2056 		/*
2057 		 * Replace lists and we're done.
2058 		 *
2059 		 * XXX: Is there any way to switch list heads instead
2060 		 *      of iterating here?
2061 		 */
2062 		while (!LIST_EMPTY(&newrules)) {
2063 			newlink = LIST_FIRST(&newrules);
2064 			LIST_REMOVE(newlink, rrl_next);
2065 			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2066 			    newlink, rrl_next);
2067 		}
2068 
2069 		RACCT_UNLOCK();
2070 
2071 		return;
2072 	}
2073 
2074 goaround:
2075 	RACCT_UNLOCK();
2076 
2077 	/*
2078 	 * Rule list changed while we were not holding the rctl_lock.
2079 	 * Free the new list and try again.
2080 	 */
2081 	while (!LIST_EMPTY(&newrules)) {
2082 		newlink = LIST_FIRST(&newrules);
2083 		LIST_REMOVE(newlink, rrl_next);
2084 		if (newlink->rrl_rule != NULL)
2085 			rctl_rule_release(newlink->rrl_rule);
2086 		uma_zfree(rctl_rule_link_zone, newlink);
2087 	}
2088 
2089 	goto again;
2090 }
2091 
2092 /*
2093  * Assign RCTL rules to the newly created process.
2094  */
2095 int
2096 rctl_proc_fork(struct proc *parent, struct proc *child)
2097 {
2098 	struct rctl_rule *rule;
2099 	struct rctl_rule_link *link;
2100 	int error;
2101 
2102 	ASSERT_RACCT_ENABLED();
2103 	RACCT_LOCK_ASSERT();
2104 	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2105 
2106 	LIST_INIT(&child->p_racct->r_rule_links);
2107 
2108 	/*
2109 	 * Go through limits applicable to the parent and assign them
2110 	 * to the child.  Rules with 'process' subject have to be duplicated
2111 	 * in order to make their rr_subject point to the new process.
2112 	 */
2113 	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2114 		if (link->rrl_rule->rr_subject_type ==
2115 		    RCTL_SUBJECT_TYPE_PROCESS) {
2116 			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2117 			if (rule == NULL)
2118 				goto fail;
2119 			KASSERT(rule->rr_subject.rs_proc == parent,
2120 			    ("rule->rr_subject.rs_proc != parent"));
2121 			rule->rr_subject.rs_proc = child;
2122 			error = rctl_racct_add_rule_locked(child->p_racct,
2123 			    rule);
2124 			rctl_rule_release(rule);
2125 			if (error != 0)
2126 				goto fail;
2127 		} else {
2128 			error = rctl_racct_add_rule_locked(child->p_racct,
2129 			    link->rrl_rule);
2130 			if (error != 0)
2131 				goto fail;
2132 		}
2133 	}
2134 
2135 	return (0);
2136 
2137 fail:
2138 	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2139 		link = LIST_FIRST(&child->p_racct->r_rule_links);
2140 		LIST_REMOVE(link, rrl_next);
2141 		rctl_rule_release(link->rrl_rule);
2142 		uma_zfree(rctl_rule_link_zone, link);
2143 	}
2144 
2145 	return (EAGAIN);
2146 }
2147 
2148 /*
2149  * Release rules attached to the racct.
2150  */
2151 void
2152 rctl_racct_release(struct racct *racct)
2153 {
2154 	struct rctl_rule_link *link;
2155 
2156 	ASSERT_RACCT_ENABLED();
2157 	RACCT_LOCK_ASSERT();
2158 
2159 	while (!LIST_EMPTY(&racct->r_rule_links)) {
2160 		link = LIST_FIRST(&racct->r_rule_links);
2161 		LIST_REMOVE(link, rrl_next);
2162 		rctl_rule_release(link->rrl_rule);
2163 		uma_zfree(rctl_rule_link_zone, link);
2164 	}
2165 }
2166 
2167 static void
2168 rctl_init(void)
2169 {
2170 
2171 	if (!racct_enable)
2172 		return;
2173 
2174 	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2175 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2176 	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2177 	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2178 	    UMA_ALIGN_PTR, 0);
2179 
2180 	/*
2181 	 * Set default values, making sure not to overwrite the ones
2182 	 * fetched from tunables.  Most of those could be set at the
2183 	 * declaration, except for the rctl_throttle_max - we cannot
2184 	 * set it there due to hz not being compile time constant.
2185 	 */
2186 	if (rctl_throttle_min < 1)
2187 		rctl_throttle_min = 1;
2188 	if (rctl_throttle_max < rctl_throttle_min)
2189 		rctl_throttle_max = 2 * hz;
2190 	if (rctl_throttle_pct < 0)
2191 		rctl_throttle_pct = 100;
2192 	if (rctl_throttle_pct2 < 0)
2193 		rctl_throttle_pct2 = 100;
2194 }
2195 
2196 #else /* !RCTL */
2197 
2198 int
2199 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
2200 {
2201 
2202 	return (ENOSYS);
2203 }
2204 
2205 int
2206 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
2207 {
2208 
2209 	return (ENOSYS);
2210 }
2211 
2212 int
2213 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
2214 {
2215 
2216 	return (ENOSYS);
2217 }
2218 
2219 int
2220 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
2221 {
2222 
2223 	return (ENOSYS);
2224 }
2225 
2226 int
2227 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
2228 {
2229 
2230 	return (ENOSYS);
2231 }
2232 
2233 #endif /* !RCTL */
2234