xref: /freebsd/sys/kern/kern_rctl.c (revision 43e29d03f416d7dda52112a29600a7c82ee1a91e)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2010 The FreeBSD Foundation
5  *
6  * This software was developed by Edward Tomasz Napierala under sponsorship
7  * from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  * $FreeBSD$
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 #include <sys/param.h>
37 #include <sys/devctl.h>
38 #include <sys/malloc.h>
39 #include <sys/queue.h>
40 #include <sys/refcount.h>
41 #include <sys/jail.h>
42 #include <sys/kernel.h>
43 #include <sys/limits.h>
44 #include <sys/loginclass.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/racct.h>
48 #include <sys/rctl.h>
49 #include <sys/resourcevar.h>
50 #include <sys/sx.h>
51 #include <sys/sysproto.h>
52 #include <sys/systm.h>
53 #include <sys/types.h>
54 #include <sys/eventhandler.h>
55 #include <sys/lock.h>
56 #include <sys/mutex.h>
57 #include <sys/rwlock.h>
58 #include <sys/sbuf.h>
59 #include <sys/taskqueue.h>
60 #include <sys/tree.h>
61 #include <vm/uma.h>
62 
63 #ifdef RCTL
64 #ifndef RACCT
65 #error "The RCTL option requires the RACCT option"
66 #endif
67 
68 FEATURE(rctl, "Resource Limits");
69 
/*
 * HRF_* flag values.  Their consumers are not in this part of the file.
 */
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer sizes, in bytes.  The expansions are parenthesized so that the
 * macros can be used as operands of higher-precedence operators (for
 * example "x % RCTL_MAX_INBUFSIZE" or "x / RCTL_MAX_OUTBUFSIZE") without
 * the expression being split apart by operator precedence.
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/*
 * Safety margin subtracted from the available %cpu by
 * rctl_pcpu_available().
 */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
79 
/* Backing variable for the kern.racct.rctl.maxbufsize sysctl. */
static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
/*
 * Rate limits, in messages per second, applied by rctl_enforce() to the
 * "log" and "devctl" actions respectively.
 */
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;

/*
 * Values below are initialized in rctl_init().
 */
static int rctl_throttle_min = -1;
static int rctl_throttle_max = -1;
static int rctl_throttle_pct = -1;
static int rctl_throttle_pct2 = -1;

/*
 * Handlers for the throttle_* sysctls below; they validate the new value
 * before storing it into the corresponding rctl_throttle_* variable.
 */
static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);

/* The kern.racct.rctl sysctl subtree. */
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
    &rctl_maxbufsize, 0, "Maximum output buffer size");
/*
 * NOTE(review): the two rate limits are declared as "int" above but are
 * exported with SYSCTL_UINT -- confirm this is intentional.
 */
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
/*
 * The throttle knobs go through handler procedures so that new values
 * can be range-checked; each one also has a matching loader tunable.
 */
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_min_sysctl, "IU",
    "Shortest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_max_sysctl, "IU",
    "Longest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_pct_sysctl, "IU",
    "Throttling penalty for process consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_pct2_sysctl, "IU",
    "Throttling penalty for container consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
125 
/*
 * 'rctl_rule_link' connects a rule with every racct it's related to.
 * For example, rule 'user:X:openfiles:deny=N/process' is linked
 * with uidinfo for user X, and to each process of that user.
 *
 * Links live on the racct's r_rule_links list and each one holds
 * a reference on the rule it points to.
 */
struct rctl_rule_link {
	/* Linkage on the owning racct's r_rule_links list. */
	LIST_ENTRY(rctl_rule_link)	rrl_next;
	/* The rule this link refers to. */
	struct rctl_rule		*rrl_rule;
	/*
	 * Set to 1 by rctl_enforce() once a one-shot action (log, devctl,
	 * signal) has fired for this link; cleared again when the rule is
	 * found not to be exceeded.
	 */
	int				rrl_exceeded;
};
136 
/*
 * One entry of a name <-> numeric value lookup table.  Tables are
 * terminated by an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* Textual name; NULL ends the table. */
	int		d_value;	/* Numeric value the name stands for. */
};
141 
142 static struct dict subjectnames[] = {
143 	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
144 	{ "user", RCTL_SUBJECT_TYPE_USER },
145 	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
146 	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
147 	{ NULL, -1 }};
148 
149 static struct dict resourcenames[] = {
150 	{ "cputime", RACCT_CPU },
151 	{ "datasize", RACCT_DATA },
152 	{ "stacksize", RACCT_STACK },
153 	{ "coredumpsize", RACCT_CORE },
154 	{ "memoryuse", RACCT_RSS },
155 	{ "memorylocked", RACCT_MEMLOCK },
156 	{ "maxproc", RACCT_NPROC },
157 	{ "openfiles", RACCT_NOFILE },
158 	{ "vmemoryuse", RACCT_VMEM },
159 	{ "pseudoterminals", RACCT_NPTS },
160 	{ "swapuse", RACCT_SWAP },
161 	{ "nthr", RACCT_NTHR },
162 	{ "msgqqueued", RACCT_MSGQQUEUED },
163 	{ "msgqsize", RACCT_MSGQSIZE },
164 	{ "nmsgq", RACCT_NMSGQ },
165 	{ "nsem", RACCT_NSEM },
166 	{ "nsemop", RACCT_NSEMOP },
167 	{ "nshm", RACCT_NSHM },
168 	{ "shmsize", RACCT_SHMSIZE },
169 	{ "wallclock", RACCT_WALLCLOCK },
170 	{ "pcpu", RACCT_PCTCPU },
171 	{ "readbps", RACCT_READBPS },
172 	{ "writebps", RACCT_WRITEBPS },
173 	{ "readiops", RACCT_READIOPS },
174 	{ "writeiops", RACCT_WRITEIOPS },
175 	{ NULL, -1 }};
176 
177 static struct dict actionnames[] = {
178 	{ "sighup", RCTL_ACTION_SIGHUP },
179 	{ "sigint", RCTL_ACTION_SIGINT },
180 	{ "sigquit", RCTL_ACTION_SIGQUIT },
181 	{ "sigill", RCTL_ACTION_SIGILL },
182 	{ "sigtrap", RCTL_ACTION_SIGTRAP },
183 	{ "sigabrt", RCTL_ACTION_SIGABRT },
184 	{ "sigemt", RCTL_ACTION_SIGEMT },
185 	{ "sigfpe", RCTL_ACTION_SIGFPE },
186 	{ "sigkill", RCTL_ACTION_SIGKILL },
187 	{ "sigbus", RCTL_ACTION_SIGBUS },
188 	{ "sigsegv", RCTL_ACTION_SIGSEGV },
189 	{ "sigsys", RCTL_ACTION_SIGSYS },
190 	{ "sigpipe", RCTL_ACTION_SIGPIPE },
191 	{ "sigalrm", RCTL_ACTION_SIGALRM },
192 	{ "sigterm", RCTL_ACTION_SIGTERM },
193 	{ "sigurg", RCTL_ACTION_SIGURG },
194 	{ "sigstop", RCTL_ACTION_SIGSTOP },
195 	{ "sigtstp", RCTL_ACTION_SIGTSTP },
196 	{ "sigchld", RCTL_ACTION_SIGCHLD },
197 	{ "sigttin", RCTL_ACTION_SIGTTIN },
198 	{ "sigttou", RCTL_ACTION_SIGTTOU },
199 	{ "sigio", RCTL_ACTION_SIGIO },
200 	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
201 	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
202 	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
203 	{ "sigprof", RCTL_ACTION_SIGPROF },
204 	{ "sigwinch", RCTL_ACTION_SIGWINCH },
205 	{ "siginfo", RCTL_ACTION_SIGINFO },
206 	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
207 	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
208 	{ "sigthr", RCTL_ACTION_SIGTHR },
209 	{ "deny", RCTL_ACTION_DENY },
210 	{ "log", RCTL_ACTION_LOG },
211 	{ "devctl", RCTL_ACTION_DEVCTL },
212 	{ "throttle", RCTL_ACTION_THROTTLE },
213 	{ NULL, -1 }};
214 
/* Subsystem initialization, registered to run at SI_SUB_RACCT. */
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);

/* UMA zones backing 'struct rctl_rule' and 'struct rctl_rule_link'. */
static uma_zone_t rctl_rule_zone;
static uma_zone_t rctl_rule_link_zone;

/* Forward declarations for helpers used before their definitions. */
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);

/* Malloc type for temporary buffers, e.g. those used in rctl_enforce(). */
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
225 
226 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
227 {
228 	int error, val = rctl_throttle_min;
229 
230 	error = sysctl_handle_int(oidp, &val, 0, req);
231 	if (error || !req->newptr)
232 		return (error);
233 	if (val < 1 || val > rctl_throttle_max)
234 		return (EINVAL);
235 
236 	RACCT_LOCK();
237 	rctl_throttle_min = val;
238 	RACCT_UNLOCK();
239 
240 	return (0);
241 }
242 
243 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
244 {
245 	int error, val = rctl_throttle_max;
246 
247 	error = sysctl_handle_int(oidp, &val, 0, req);
248 	if (error || !req->newptr)
249 		return (error);
250 	if (val < rctl_throttle_min)
251 		return (EINVAL);
252 
253 	RACCT_LOCK();
254 	rctl_throttle_max = val;
255 	RACCT_UNLOCK();
256 
257 	return (0);
258 }
259 
260 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
261 {
262 	int error, val = rctl_throttle_pct;
263 
264 	error = sysctl_handle_int(oidp, &val, 0, req);
265 	if (error || !req->newptr)
266 		return (error);
267 	if (val < 0)
268 		return (EINVAL);
269 
270 	RACCT_LOCK();
271 	rctl_throttle_pct = val;
272 	RACCT_UNLOCK();
273 
274 	return (0);
275 }
276 
277 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
278 {
279 	int error, val = rctl_throttle_pct2;
280 
281 	error = sysctl_handle_int(oidp, &val, 0, req);
282 	if (error || !req->newptr)
283 		return (error);
284 	if (val < 0)
285 		return (EINVAL);
286 
287 	RACCT_LOCK();
288 	rctl_throttle_pct2 = val;
289 	RACCT_UNLOCK();
290 
291 	return (0);
292 }
293 
294 static const char *
295 rctl_subject_type_name(int subject)
296 {
297 	int i;
298 
299 	for (i = 0; subjectnames[i].d_name != NULL; i++) {
300 		if (subjectnames[i].d_value == subject)
301 			return (subjectnames[i].d_name);
302 	}
303 
304 	panic("rctl_subject_type_name: unknown subject type %d", subject);
305 }
306 
307 static const char *
308 rctl_action_name(int action)
309 {
310 	int i;
311 
312 	for (i = 0; actionnames[i].d_name != NULL; i++) {
313 		if (actionnames[i].d_value == action)
314 			return (actionnames[i].d_name);
315 	}
316 
317 	panic("rctl_action_name: unknown action %d", action);
318 }
319 
320 const char *
321 rctl_resource_name(int resource)
322 {
323 	int i;
324 
325 	for (i = 0; resourcenames[i].d_name != NULL; i++) {
326 		if (resourcenames[i].d_value == resource)
327 			return (resourcenames[i].d_name);
328 	}
329 
330 	panic("rctl_resource_name: unknown resource %d", resource);
331 }
332 
333 static struct racct *
334 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
335 {
336 	struct ucred *cred = p->p_ucred;
337 
338 	ASSERT_RACCT_ENABLED();
339 	RACCT_LOCK_ASSERT();
340 
341 	switch (rule->rr_per) {
342 	case RCTL_SUBJECT_TYPE_PROCESS:
343 		return (p->p_racct);
344 	case RCTL_SUBJECT_TYPE_USER:
345 		return (cred->cr_ruidinfo->ui_racct);
346 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
347 		return (cred->cr_loginclass->lc_racct);
348 	case RCTL_SUBJECT_TYPE_JAIL:
349 		return (cred->cr_prison->pr_prison_racct->prr_racct);
350 	default:
351 		panic("%s: unknown per %d", __func__, rule->rr_per);
352 	}
353 }
354 
355 /*
356  * Return the amount of resource that can be allocated by 'p' before
357  * hitting 'rule'.
358  */
359 static int64_t
360 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
361 {
362 	const struct racct *racct;
363 	int64_t available;
364 
365 	ASSERT_RACCT_ENABLED();
366 	RACCT_LOCK_ASSERT();
367 
368 	racct = rctl_proc_rule_to_racct(p, rule);
369 	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
370 
371 	return (available);
372 }
373 
374 /*
375  * Called every second for proc, uidinfo, loginclass, and jail containers.
376  * If the limit isn't exceeded, it decreases the usage amount to zero.
377  * Otherwise, it decreases it by the value of the limit.  This way
378  * resource consumption exceeding the limit "carries over" to the next
379  * period.
380  */
381 void
382 rctl_throttle_decay(struct racct *racct, int resource)
383 {
384 	struct rctl_rule *rule;
385 	struct rctl_rule_link *link;
386 	int64_t minavailable;
387 
388 	ASSERT_RACCT_ENABLED();
389 	RACCT_LOCK_ASSERT();
390 
391 	minavailable = INT64_MAX;
392 
393 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
394 		rule = link->rrl_rule;
395 
396 		if (rule->rr_resource != resource)
397 			continue;
398 		if (rule->rr_action != RCTL_ACTION_THROTTLE)
399 			continue;
400 
401 		if (rule->rr_amount < minavailable)
402 			minavailable = rule->rr_amount;
403 	}
404 
405 	if (racct->r_resources[resource] < minavailable) {
406 		racct->r_resources[resource] = 0;
407 	} else {
408 		/*
409 		 * Cap utilization counter at ten times the limit.  Otherwise,
410 		 * if we changed the rule lowering the allowed amount, it could
411 		 * take unreasonably long time for the accumulated resource
412 		 * usage to drop.
413 		 */
414 		if (racct->r_resources[resource] > minavailable * 10)
415 			racct->r_resources[resource] = minavailable * 10;
416 
417 		racct->r_resources[resource] -= minavailable;
418 	}
419 }
420 
421 /*
422  * Special version of rctl_get_available() for the %CPU resource.
423  * We slightly cheat here and return less than we normally would.
424  */
425 int64_t
426 rctl_pcpu_available(const struct proc *p) {
427 	struct rctl_rule *rule;
428 	struct rctl_rule_link *link;
429 	int64_t available, minavailable, limit;
430 
431 	ASSERT_RACCT_ENABLED();
432 	RACCT_LOCK_ASSERT();
433 
434 	minavailable = INT64_MAX;
435 	limit = 0;
436 
437 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
438 		rule = link->rrl_rule;
439 		if (rule->rr_resource != RACCT_PCTCPU)
440 			continue;
441 		if (rule->rr_action != RCTL_ACTION_DENY)
442 			continue;
443 		available = rctl_available_resource(p, rule);
444 		if (available < minavailable) {
445 			minavailable = available;
446 			limit = rule->rr_amount;
447 		}
448 	}
449 
450 	/*
451 	 * Return slightly less than actual value of the available
452 	 * %cpu resource.  This makes %cpu throttling more aggressive
453 	 * and lets us act sooner than the limits are already exceeded.
454 	 */
455 	if (limit != 0) {
456 		if (limit > 2 * RCTL_PCPU_SHIFT)
457 			minavailable -= RCTL_PCPU_SHIFT;
458 		else
459 			minavailable -= (limit / 2);
460 	}
461 
462 	return (minavailable);
463 }
464 
/*
 * Saturating addition: returns a + b, or UINT64_MAX if the sum does not
 * fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

	/* The sum overflows exactly when b exceeds the room left above a. */
	if (b > UINT64_MAX - a)
		return (UINT64_MAX);

	return (a + b);
}
480 
/*
 * Saturating multiplication: returns a * b, or UINT64_MAX if the product
 * does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	/* A zero factor can never overflow; also avoids dividing by zero. */
	if (a == 0 || b == 0)
		return (0);

	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
490 
/*
 * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
 * to what it keeps allocated now.  Returns non-zero if the allocation should
 * be denied, 0 otherwise.
 *
 * Every rule linked to the process racct that concerns 'resource' and
 * would be exceeded by the allocation has its action carried out: deny,
 * log, devctl notification, throttle, or signal delivery.  One-shot
 * actions (log, devctl, signal) are remembered in the link's rrl_exceeded
 * flag so that they fire only once until the rule stops being exceeded.
 *
 * Must be called with the racct lock held.  The non-zero return value is
 * a placeholder (EDOOFUS) that callers are expected to translate.
 */
int
rctl_enforce(struct proc *p, int resource, uint64_t amount)
{
	/* Rate-limiting state for the "log" and "devctl" actions. */
	static struct timeval log_lasttime, devctl_lasttime;
	static int log_curtime = 0, devctl_curtime = 0;
	struct rctl_rule *rule;
	struct rctl_rule_link *link;
	struct sbuf sb;
	char *buf;
	int64_t available;
	uint64_t sleep_ms, sleep_ratio;
	int should_deny = 0;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	/*
	 * There may be more than one matching rule; go through all of them.
	 * Denial should be done last, after logging and sending signals.
	 */
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		rule = link->rrl_rule;
		if (rule->rr_resource != resource)
			continue;

		/*
		 * The rule is not exceeded by this allocation; re-arm its
		 * one-shot actions.
		 */
		available = rctl_available_resource(p, rule);
		if (available >= (int64_t)amount) {
			link->rrl_exceeded = 0;
			continue;
		}

		switch (rule->rr_action) {
		case RCTL_ACTION_DENY:
			should_deny = 1;
			continue;
		case RCTL_ACTION_LOG:
			/*
			 * If rrl_exceeded != 0, it means we've already
			 * logged a warning for this process.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			/*
			 * If the process state is not fully initialized yet,
			 * we can't access most of the required fields, e.g.
			 * p->p_comm.  This happens when called from fork1().
			 * Ignore this rule for now; it will be processed just
			 * after fork, when called from racct_proc_fork_done().
			 */
			if (p->p_state != PRS_NORMAL)
				continue;

			if (!ppsratecheck(&log_lasttime, &log_curtime,
			    rctl_log_rate_limit))
				continue;

			/* Non-sleeping allocation: the racct lock is held. */
			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
			if (buf == NULL) {
				printf("rctl_enforce: out of memory\n");
				continue;
			}
			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
			rctl_rule_to_sbuf(&sb, rule);
			sbuf_finish(&sb);
			printf("rctl: rule \"%s\" matched by pid %d "
			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
			sbuf_delete(&sb);
			free(buf, M_RCTL);
			link->rrl_exceeded = 1;
			continue;
		case RCTL_ACTION_DEVCTL:
			/* Same one-shot and rate-limit logic as for "log". */
			if (link->rrl_exceeded != 0)
				continue;

			if (p->p_state != PRS_NORMAL)
				continue;

			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
			    rctl_devctl_rate_limit))
				continue;

			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
			if (buf == NULL) {
				printf("rctl_enforce: out of memory\n");
				continue;
			}
			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
			sbuf_printf(&sb, "rule=");
			rctl_rule_to_sbuf(&sb, rule);
			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
			    p->p_pid, p->p_ucred->cr_ruid,
			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
			sbuf_finish(&sb);
			devctl_notify("RCTL", "rule", "matched",
			    sbuf_data(&sb));
			sbuf_delete(&sb);
			free(buf, M_RCTL);
			link->rrl_exceeded = 1;
			continue;
		case RCTL_ACTION_THROTTLE:
			if (p->p_state != PRS_NORMAL)
				continue;

			/*
			 * A zero limit would make the computation below
			 * divide by zero; throttle for the maximum duration.
			 */
			if (rule->rr_amount == 0) {
				racct_proc_throttle(p, rctl_throttle_max);
				continue;
			}

			/*
			 * Make the process sleep for a fraction of second
			 * proportional to the ratio of process' resource
			 * utilization compared to the limit.  The point is
			 * to penalize resource hogs: processes that consume
			 * more of the available resources sleep for longer.
			 *
			 * We're trying to defer division until the very end,
			 * to minimize the rounding effects.  The following
			 * calculation could have been written in a clearer
			 * way like this:
			 *
			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
			 *     rule->rr_amount;
			 * sleep_ms *= rctl_throttle_pct / 100;
			 * if (sleep_ms < rctl_throttle_min)
			 *         sleep_ms = rctl_throttle_min;
			 *
			 * NOTE(review): despite its name, sleep_ms appears to
			 * be expressed in ticks rather than milliseconds (it
			 * is scaled by hz and bounded by the throttle_min/max
			 * sysctls, which are documented as "in hz") --
			 * confirm against racct_proc_throttle().
			 */
			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
				sleep_ms = rctl_throttle_min * rule->rr_amount;

			/*
			 * Multiply that by the ratio of the resource
			 * consumption for the container compared to the limit,
			 * squared.  In other words, a process in a container
			 * that is two times over the limit will be throttled
			 * four times as much for hitting the same rule.  The
			 * point is to penalize processes more if the container
			 * itself (eg certain UID or jail) is above the limit.
			 */
			if (available < 0)
				sleep_ratio = -available / rule->rr_amount;
			else
				sleep_ratio = 0;
			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));

			/*
			 * Finally the division.
			 */
			sleep_ms /= rule->rr_amount;

			if (sleep_ms > rctl_throttle_max)
				sleep_ms = rctl_throttle_max;
#if 0
			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
			   __func__, p->p_pid, p->p_comm,
			   p->p_racct->r_resources[resource],
			   rule->rr_amount, (uintmax_t)sleep_ms,
			   (uintmax_t)sleep_ratio, (intmax_t)available);
#endif

			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
			racct_proc_throttle(p, sleep_ms);
			continue;
		default:
			/* Anything else is expected to be a signal action. */
			if (link->rrl_exceeded != 0)
				continue;

			if (p->p_state != PRS_NORMAL)
				continue;

			KASSERT(rule->rr_action > 0 &&
			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
			    ("rctl_enforce: unknown action %d",
			     rule->rr_action));

			/*
			 * We're using the fact that RCTL_ACTION_SIG* values
			 * are equal to their counterparts from sys/signal.h.
			 */
			kern_psignal(p, rule->rr_action);
			link->rrl_exceeded = 1;
			continue;
		}
	}

	if (should_deny) {
		/*
		 * Return fake error code; the caller should change it
		 * into one proper for the situation - EFSIZ, ENOMEM etc.
		 */
		return (EDOOFUS);
	}

	return (0);
}
699 
700 uint64_t
701 rctl_get_limit(struct proc *p, int resource)
702 {
703 	struct rctl_rule *rule;
704 	struct rctl_rule_link *link;
705 	uint64_t amount = UINT64_MAX;
706 
707 	ASSERT_RACCT_ENABLED();
708 	RACCT_LOCK_ASSERT();
709 
710 	/*
711 	 * There may be more than one matching rule; go through all of them.
712 	 * Denial should be done last, after logging and sending signals.
713 	 */
714 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
715 		rule = link->rrl_rule;
716 		if (rule->rr_resource != resource)
717 			continue;
718 		if (rule->rr_action != RCTL_ACTION_DENY)
719 			continue;
720 		if (rule->rr_amount < amount)
721 			amount = rule->rr_amount;
722 	}
723 
724 	return (amount);
725 }
726 
727 uint64_t
728 rctl_get_available(struct proc *p, int resource)
729 {
730 	struct rctl_rule *rule;
731 	struct rctl_rule_link *link;
732 	int64_t available, minavailable, allocated;
733 
734 	minavailable = INT64_MAX;
735 
736 	ASSERT_RACCT_ENABLED();
737 	RACCT_LOCK_ASSERT();
738 
739 	/*
740 	 * There may be more than one matching rule; go through all of them.
741 	 * Denial should be done last, after logging and sending signals.
742 	 */
743 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
744 		rule = link->rrl_rule;
745 		if (rule->rr_resource != resource)
746 			continue;
747 		if (rule->rr_action != RCTL_ACTION_DENY)
748 			continue;
749 		available = rctl_available_resource(p, rule);
750 		if (available < minavailable)
751 			minavailable = available;
752 	}
753 
754 	/*
755 	 * XXX: Think about this _hard_.
756 	 */
757 	allocated = p->p_racct->r_resources[resource];
758 	if (minavailable < INT64_MAX - allocated)
759 		minavailable += allocated;
760 	if (minavailable < 0)
761 		minavailable = 0;
762 
763 	return (minavailable);
764 }
765 
766 static int
767 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
768 {
769 
770 	ASSERT_RACCT_ENABLED();
771 
772 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
773 		if (rule->rr_subject_type != filter->rr_subject_type)
774 			return (0);
775 
776 		switch (filter->rr_subject_type) {
777 		case RCTL_SUBJECT_TYPE_PROCESS:
778 			if (filter->rr_subject.rs_proc != NULL &&
779 			    rule->rr_subject.rs_proc !=
780 			    filter->rr_subject.rs_proc)
781 				return (0);
782 			break;
783 		case RCTL_SUBJECT_TYPE_USER:
784 			if (filter->rr_subject.rs_uip != NULL &&
785 			    rule->rr_subject.rs_uip !=
786 			    filter->rr_subject.rs_uip)
787 				return (0);
788 			break;
789 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
790 			if (filter->rr_subject.rs_loginclass != NULL &&
791 			    rule->rr_subject.rs_loginclass !=
792 			    filter->rr_subject.rs_loginclass)
793 				return (0);
794 			break;
795 		case RCTL_SUBJECT_TYPE_JAIL:
796 			if (filter->rr_subject.rs_prison_racct != NULL &&
797 			    rule->rr_subject.rs_prison_racct !=
798 			    filter->rr_subject.rs_prison_racct)
799 				return (0);
800 			break;
801 		default:
802 			panic("rctl_rule_matches: unknown subject type %d",
803 			    filter->rr_subject_type);
804 		}
805 	}
806 
807 	if (filter->rr_resource != RACCT_UNDEFINED) {
808 		if (rule->rr_resource != filter->rr_resource)
809 			return (0);
810 	}
811 
812 	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
813 		if (rule->rr_action != filter->rr_action)
814 			return (0);
815 	}
816 
817 	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
818 		if (rule->rr_amount != filter->rr_amount)
819 			return (0);
820 	}
821 
822 	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
823 		if (rule->rr_per != filter->rr_per)
824 			return (0);
825 	}
826 
827 	return (1);
828 }
829 
830 static int
831 str2value(const char *str, int *value, struct dict *table)
832 {
833 	int i;
834 
835 	if (value == NULL)
836 		return (EINVAL);
837 
838 	for (i = 0; table[i].d_name != NULL; i++) {
839 		if (strcasecmp(table[i].d_name, str) == 0) {
840 			*value =  table[i].d_value;
841 			return (0);
842 		}
843 	}
844 
845 	return (EINVAL);
846 }
847 
/*
 * Parse the decimal number in 'str' into '*value'.
 *
 * Returns 0 on success, or EINVAL if 'str' is NULL, contains no digits
 * (including the empty string) or has trailing characters after the
 * number.  As before, '*value' is written even when EINVAL is returned
 * for a malformed string.
 */
static int
str2id(const char *str, id_t *value)
{
	char *end;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &end, 10);

	/*
	 * end == str means that nothing was converted.  This has to be
	 * checked explicitly: for an empty string both the consumed length
	 * and the string length are zero, so a length comparison alone
	 * would accept "" as the number 0.
	 */
	if (end == str || *end != '\0')
		return (EINVAL);

	return (0);
}
862 
/*
 * Parse the non-negative decimal number in 'str' into '*value'.
 *
 * Returns 0 on success, EINVAL if 'str' is NULL, contains no digits
 * (including the empty string) or has trailing characters after the
 * number, and ERANGE if the parsed value is negative once stored in an
 * int64_t.  As before, '*value' is written even when an error is
 * returned for a malformed or out-of-range string.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *end;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &end, 10);

	/*
	 * end == str means that nothing was converted.  This has to be
	 * checked explicitly: for an empty string both the consumed length
	 * and the string length are zero, so a length comparison alone
	 * would accept "" as the number 0.
	 */
	if (end == str || *end != '\0')
		return (EINVAL);

	if (*value < 0)
		return (ERANGE);

	return (0);
}
880 
881 /*
882  * Connect the rule to the racct, increasing refcount for the rule.
883  */
884 static void
885 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
886 {
887 	struct rctl_rule_link *link;
888 
889 	ASSERT_RACCT_ENABLED();
890 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
891 
892 	rctl_rule_acquire(rule);
893 	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
894 	link->rrl_rule = rule;
895 	link->rrl_exceeded = 0;
896 
897 	RACCT_LOCK();
898 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
899 	RACCT_UNLOCK();
900 }
901 
902 static int
903 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
904 {
905 	struct rctl_rule_link *link;
906 
907 	ASSERT_RACCT_ENABLED();
908 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
909 	RACCT_LOCK_ASSERT();
910 
911 	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
912 	if (link == NULL)
913 		return (ENOMEM);
914 	rctl_rule_acquire(rule);
915 	link->rrl_rule = rule;
916 	link->rrl_exceeded = 0;
917 
918 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
919 
920 	return (0);
921 }
922 
923 /*
924  * Remove limits for a rules matching the filter and release
925  * the refcounts for the rules, possibly freeing them.  Returns
926  * the number of limit structures removed.
927  */
928 static int
929 rctl_racct_remove_rules(struct racct *racct,
930     const struct rctl_rule *filter)
931 {
932 	struct rctl_rule_link *link, *linktmp;
933 	int removed = 0;
934 
935 	ASSERT_RACCT_ENABLED();
936 	RACCT_LOCK_ASSERT();
937 
938 	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
939 		if (!rctl_rule_matches(link->rrl_rule, filter))
940 			continue;
941 
942 		LIST_REMOVE(link, rrl_next);
943 		rctl_rule_release(link->rrl_rule);
944 		uma_zfree(rctl_rule_link_zone, link);
945 		removed++;
946 	}
947 	return (removed);
948 }
949 
950 static void
951 rctl_rule_acquire_subject(struct rctl_rule *rule)
952 {
953 
954 	ASSERT_RACCT_ENABLED();
955 
956 	switch (rule->rr_subject_type) {
957 	case RCTL_SUBJECT_TYPE_UNDEFINED:
958 	case RCTL_SUBJECT_TYPE_PROCESS:
959 		break;
960 	case RCTL_SUBJECT_TYPE_JAIL:
961 		if (rule->rr_subject.rs_prison_racct != NULL)
962 			prison_racct_hold(rule->rr_subject.rs_prison_racct);
963 		break;
964 	case RCTL_SUBJECT_TYPE_USER:
965 		if (rule->rr_subject.rs_uip != NULL)
966 			uihold(rule->rr_subject.rs_uip);
967 		break;
968 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
969 		if (rule->rr_subject.rs_loginclass != NULL)
970 			loginclass_hold(rule->rr_subject.rs_loginclass);
971 		break;
972 	default:
973 		panic("rctl_rule_acquire_subject: unknown subject type %d",
974 		    rule->rr_subject_type);
975 	}
976 }
977 
978 static void
979 rctl_rule_release_subject(struct rctl_rule *rule)
980 {
981 
982 	ASSERT_RACCT_ENABLED();
983 
984 	switch (rule->rr_subject_type) {
985 	case RCTL_SUBJECT_TYPE_UNDEFINED:
986 	case RCTL_SUBJECT_TYPE_PROCESS:
987 		break;
988 	case RCTL_SUBJECT_TYPE_JAIL:
989 		if (rule->rr_subject.rs_prison_racct != NULL)
990 			prison_racct_free(rule->rr_subject.rs_prison_racct);
991 		break;
992 	case RCTL_SUBJECT_TYPE_USER:
993 		if (rule->rr_subject.rs_uip != NULL)
994 			uifree(rule->rr_subject.rs_uip);
995 		break;
996 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
997 		if (rule->rr_subject.rs_loginclass != NULL)
998 			loginclass_free(rule->rr_subject.rs_loginclass);
999 		break;
1000 	default:
1001 		panic("rctl_rule_release_subject: unknown subject type %d",
1002 		    rule->rr_subject_type);
1003 	}
1004 }
1005 
1006 struct rctl_rule *
1007 rctl_rule_alloc(int flags)
1008 {
1009 	struct rctl_rule *rule;
1010 
1011 	ASSERT_RACCT_ENABLED();
1012 
1013 	rule = uma_zalloc(rctl_rule_zone, flags);
1014 	if (rule == NULL)
1015 		return (NULL);
1016 	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1017 	rule->rr_subject.rs_proc = NULL;
1018 	rule->rr_subject.rs_uip = NULL;
1019 	rule->rr_subject.rs_loginclass = NULL;
1020 	rule->rr_subject.rs_prison_racct = NULL;
1021 	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1022 	rule->rr_resource = RACCT_UNDEFINED;
1023 	rule->rr_action = RCTL_ACTION_UNDEFINED;
1024 	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1025 	refcount_init(&rule->rr_refcount, 1);
1026 
1027 	return (rule);
1028 }
1029 
1030 struct rctl_rule *
1031 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1032 {
1033 	struct rctl_rule *copy;
1034 
1035 	ASSERT_RACCT_ENABLED();
1036 
1037 	copy = uma_zalloc(rctl_rule_zone, flags);
1038 	if (copy == NULL)
1039 		return (NULL);
1040 	copy->rr_subject_type = rule->rr_subject_type;
1041 	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1042 	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1043 	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1044 	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1045 	copy->rr_per = rule->rr_per;
1046 	copy->rr_resource = rule->rr_resource;
1047 	copy->rr_action = rule->rr_action;
1048 	copy->rr_amount = rule->rr_amount;
1049 	refcount_init(&copy->rr_refcount, 1);
1050 	rctl_rule_acquire_subject(copy);
1051 
1052 	return (copy);
1053 }
1054 
/*
 * Take an additional reference on 'rule'.  The rule must already have
 * at least one reference (asserted under KASSERT).
 */
void
rctl_rule_acquire(struct rctl_rule *rule)
{

	ASSERT_RACCT_ENABLED();
	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));

	refcount_acquire(&rule->rr_refcount);
}
1064 
1065 static void
1066 rctl_rule_free(void *context, int pending)
1067 {
1068 	struct rctl_rule *rule;
1069 
1070 	rule = (struct rctl_rule *)context;
1071 
1072 	ASSERT_RACCT_ENABLED();
1073 	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1074 
1075 	/*
1076 	 * We don't need locking here; rule is guaranteed to be inaccessible.
1077 	 */
1078 
1079 	rctl_rule_release_subject(rule);
1080 	uma_zfree(rctl_rule_zone, rule);
1081 }
1082 
1083 void
1084 rctl_rule_release(struct rctl_rule *rule)
1085 {
1086 
1087 	ASSERT_RACCT_ENABLED();
1088 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1089 
1090 	if (refcount_release(&rule->rr_refcount)) {
1091 		/*
1092 		 * rctl_rule_release() is often called when iterating
1093 		 * over all the uidinfo structures in the system,
1094 		 * holding uihashtbl_lock.  Since rctl_rule_free()
1095 		 * might end up calling uifree(), this would lead
1096 		 * to lock recursion.  Use taskqueue to avoid this.
1097 		 */
1098 		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1099 		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1100 	}
1101 }
1102 
1103 static int
1104 rctl_rule_fully_specified(const struct rctl_rule *rule)
1105 {
1106 
1107 	ASSERT_RACCT_ENABLED();
1108 
1109 	switch (rule->rr_subject_type) {
1110 	case RCTL_SUBJECT_TYPE_UNDEFINED:
1111 		return (0);
1112 	case RCTL_SUBJECT_TYPE_PROCESS:
1113 		if (rule->rr_subject.rs_proc == NULL)
1114 			return (0);
1115 		break;
1116 	case RCTL_SUBJECT_TYPE_USER:
1117 		if (rule->rr_subject.rs_uip == NULL)
1118 			return (0);
1119 		break;
1120 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1121 		if (rule->rr_subject.rs_loginclass == NULL)
1122 			return (0);
1123 		break;
1124 	case RCTL_SUBJECT_TYPE_JAIL:
1125 		if (rule->rr_subject.rs_prison_racct == NULL)
1126 			return (0);
1127 		break;
1128 	default:
1129 		panic("rctl_rule_fully_specified: unknown subject type %d",
1130 		    rule->rr_subject_type);
1131 	}
1132 	if (rule->rr_resource == RACCT_UNDEFINED)
1133 		return (0);
1134 	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1135 		return (0);
1136 	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1137 		return (0);
1138 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1139 		return (0);
1140 
1141 	return (1);
1142 }
1143 
1144 static int
1145 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1146 {
1147 	struct rctl_rule *rule;
1148 	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1149 	     *amountstr, *perstr;
1150 	id_t id;
1151 	int error = 0;
1152 
1153 	ASSERT_RACCT_ENABLED();
1154 
1155 	rule = rctl_rule_alloc(M_WAITOK);
1156 
1157 	subjectstr = strsep(&rulestr, ":");
1158 	subject_idstr = strsep(&rulestr, ":");
1159 	resourcestr = strsep(&rulestr, ":");
1160 	actionstr = strsep(&rulestr, "=/");
1161 	amountstr = strsep(&rulestr, "/");
1162 	perstr = rulestr;
1163 
1164 	if (subjectstr == NULL || subjectstr[0] == '\0')
1165 		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1166 	else {
1167 		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1168 		if (error != 0)
1169 			goto out;
1170 	}
1171 
1172 	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1173 		rule->rr_subject.rs_proc = NULL;
1174 		rule->rr_subject.rs_uip = NULL;
1175 		rule->rr_subject.rs_loginclass = NULL;
1176 		rule->rr_subject.rs_prison_racct = NULL;
1177 	} else {
1178 		switch (rule->rr_subject_type) {
1179 		case RCTL_SUBJECT_TYPE_UNDEFINED:
1180 			error = EINVAL;
1181 			goto out;
1182 		case RCTL_SUBJECT_TYPE_PROCESS:
1183 			error = str2id(subject_idstr, &id);
1184 			if (error != 0)
1185 				goto out;
1186 			sx_assert(&allproc_lock, SA_LOCKED);
1187 			rule->rr_subject.rs_proc = pfind(id);
1188 			if (rule->rr_subject.rs_proc == NULL) {
1189 				error = ESRCH;
1190 				goto out;
1191 			}
1192 			PROC_UNLOCK(rule->rr_subject.rs_proc);
1193 			break;
1194 		case RCTL_SUBJECT_TYPE_USER:
1195 			error = str2id(subject_idstr, &id);
1196 			if (error != 0)
1197 				goto out;
1198 			rule->rr_subject.rs_uip = uifind(id);
1199 			break;
1200 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1201 			rule->rr_subject.rs_loginclass =
1202 			    loginclass_find(subject_idstr);
1203 			if (rule->rr_subject.rs_loginclass == NULL) {
1204 				error = ENAMETOOLONG;
1205 				goto out;
1206 			}
1207 			break;
1208 		case RCTL_SUBJECT_TYPE_JAIL:
1209 			rule->rr_subject.rs_prison_racct =
1210 			    prison_racct_find(subject_idstr);
1211 			if (rule->rr_subject.rs_prison_racct == NULL) {
1212 				error = ENAMETOOLONG;
1213 				goto out;
1214 			}
1215 			break;
1216                default:
1217                        panic("rctl_string_to_rule: unknown subject type %d",
1218                            rule->rr_subject_type);
1219                }
1220 	}
1221 
1222 	if (resourcestr == NULL || resourcestr[0] == '\0')
1223 		rule->rr_resource = RACCT_UNDEFINED;
1224 	else {
1225 		error = str2value(resourcestr, &rule->rr_resource,
1226 		    resourcenames);
1227 		if (error != 0)
1228 			goto out;
1229 	}
1230 
1231 	if (actionstr == NULL || actionstr[0] == '\0')
1232 		rule->rr_action = RCTL_ACTION_UNDEFINED;
1233 	else {
1234 		error = str2value(actionstr, &rule->rr_action, actionnames);
1235 		if (error != 0)
1236 			goto out;
1237 	}
1238 
1239 	if (amountstr == NULL || amountstr[0] == '\0')
1240 		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1241 	else {
1242 		error = str2int64(amountstr, &rule->rr_amount);
1243 		if (error != 0)
1244 			goto out;
1245 		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1246 			if (rule->rr_amount > INT64_MAX / 1000000) {
1247 				error = ERANGE;
1248 				goto out;
1249 			}
1250 			rule->rr_amount *= 1000000;
1251 		}
1252 	}
1253 
1254 	if (perstr == NULL || perstr[0] == '\0')
1255 		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1256 	else {
1257 		error = str2value(perstr, &rule->rr_per, subjectnames);
1258 		if (error != 0)
1259 			goto out;
1260 	}
1261 
1262 out:
1263 	if (error == 0)
1264 		*rulep = rule;
1265 	else
1266 		rctl_rule_release(rule);
1267 
1268 	return (error);
1269 }
1270 
1271 /*
1272  * Link a rule with all the subjects it applies to.
1273  */
1274 int
1275 rctl_rule_add(struct rctl_rule *rule)
1276 {
1277 	struct proc *p;
1278 	struct ucred *cred;
1279 	struct uidinfo *uip;
1280 	struct prison *pr;
1281 	struct prison_racct *prr;
1282 	struct loginclass *lc;
1283 	struct rctl_rule *rule2;
1284 	int match;
1285 
1286 	ASSERT_RACCT_ENABLED();
1287 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1288 
1289 	/*
1290 	 * Some rules just don't make sense, like "deny" rule for an undeniable
1291 	 * resource.  The exception are the RSS and %CPU resources - they are
1292 	 * not deniable in the racct sense, but the limit is enforced in
1293 	 * a different way.
1294 	 */
1295 	if (rule->rr_action == RCTL_ACTION_DENY &&
1296 	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
1297 	    rule->rr_resource != RACCT_RSS &&
1298 	    rule->rr_resource != RACCT_PCTCPU) {
1299 		return (EOPNOTSUPP);
1300 	}
1301 
1302 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1303 	    !RACCT_IS_DECAYING(rule->rr_resource)) {
1304 		return (EOPNOTSUPP);
1305 	}
1306 
1307 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1308 	    rule->rr_resource == RACCT_PCTCPU) {
1309 		return (EOPNOTSUPP);
1310 	}
1311 
1312 	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1313 	    RACCT_IS_SLOPPY(rule->rr_resource)) {
1314 		return (EOPNOTSUPP);
1315 	}
1316 
1317 	/*
1318 	 * Make sure there are no duplicated rules.  Also, for the "deny"
1319 	 * rules, remove ones differing only by "amount".
1320 	 */
1321 	if (rule->rr_action == RCTL_ACTION_DENY) {
1322 		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1323 		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1324 		rctl_rule_remove(rule2);
1325 		rctl_rule_release(rule2);
1326 	} else
1327 		rctl_rule_remove(rule);
1328 
1329 	switch (rule->rr_subject_type) {
1330 	case RCTL_SUBJECT_TYPE_PROCESS:
1331 		p = rule->rr_subject.rs_proc;
1332 		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1333 
1334 		rctl_racct_add_rule(p->p_racct, rule);
1335 		/*
1336 		 * In case of per-process rule, we don't have anything more
1337 		 * to do.
1338 		 */
1339 		return (0);
1340 
1341 	case RCTL_SUBJECT_TYPE_USER:
1342 		uip = rule->rr_subject.rs_uip;
1343 		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1344 		rctl_racct_add_rule(uip->ui_racct, rule);
1345 		break;
1346 
1347 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1348 		lc = rule->rr_subject.rs_loginclass;
1349 		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1350 		rctl_racct_add_rule(lc->lc_racct, rule);
1351 		break;
1352 
1353 	case RCTL_SUBJECT_TYPE_JAIL:
1354 		prr = rule->rr_subject.rs_prison_racct;
1355 		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1356 		rctl_racct_add_rule(prr->prr_racct, rule);
1357 		break;
1358 
1359 	default:
1360 		panic("rctl_rule_add: unknown subject type %d",
1361 		    rule->rr_subject_type);
1362 	}
1363 
1364 	/*
1365 	 * Now go through all the processes and add the new rule to the ones
1366 	 * it applies to.
1367 	 */
1368 	sx_assert(&allproc_lock, SA_LOCKED);
1369 	FOREACH_PROC_IN_SYSTEM(p) {
1370 		cred = p->p_ucred;
1371 		switch (rule->rr_subject_type) {
1372 		case RCTL_SUBJECT_TYPE_USER:
1373 			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1374 			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1375 				break;
1376 			continue;
1377 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1378 			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1379 				break;
1380 			continue;
1381 		case RCTL_SUBJECT_TYPE_JAIL:
1382 			match = 0;
1383 			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1384 				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1385 					match = 1;
1386 					break;
1387 				}
1388 			}
1389 			if (match)
1390 				break;
1391 			continue;
1392 		default:
1393 			panic("rctl_rule_add: unknown subject type %d",
1394 			    rule->rr_subject_type);
1395 		}
1396 
1397 		rctl_racct_add_rule(p->p_racct, rule);
1398 	}
1399 
1400 	return (0);
1401 }
1402 
/*
 * Hook handed to the *_racct_foreach() iterators together with
 * rctl_rule_post_callback(); it acquires the RACCT lock.
 * NOTE(review): presumably invoked before the per-racct callback
 * runs -- confirm against the iterator implementations.
 */
static void
rctl_rule_pre_callback(void)
{

	RACCT_LOCK();
}
1409 
/*
 * Counterpart of rctl_rule_pre_callback() for the *_racct_foreach()
 * iterators; it drops the RACCT lock.
 */
static void
rctl_rule_post_callback(void)
{

	RACCT_UNLOCK();
}
1416 
/*
 * Per-racct callback used by rctl_rule_remove().  'arg2' is the filter
 * rule; 'arg3' points to an int that accumulates the value returned by
 * rctl_racct_remove_rules() across all visited raccts.  Must be called
 * with the RACCT lock held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *total = arg3;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	*total += rctl_racct_remove_rules(racct, filter);
}
1430 
1431 /*
1432  * Remove all rules that match the filter.
1433  */
1434 int
1435 rctl_rule_remove(struct rctl_rule *filter)
1436 {
1437 	struct proc *p;
1438 	int found = 0;
1439 
1440 	ASSERT_RACCT_ENABLED();
1441 
1442 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1443 	    filter->rr_subject.rs_proc != NULL) {
1444 		p = filter->rr_subject.rs_proc;
1445 		RACCT_LOCK();
1446 		found = rctl_racct_remove_rules(p->p_racct, filter);
1447 		RACCT_UNLOCK();
1448 		if (found)
1449 			return (0);
1450 		return (ESRCH);
1451 	}
1452 
1453 	loginclass_racct_foreach(rctl_rule_remove_callback,
1454 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1455 	    filter, (void *)&found);
1456 	ui_racct_foreach(rctl_rule_remove_callback,
1457 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1458 	    filter, (void *)&found);
1459 	prison_racct_foreach(rctl_rule_remove_callback,
1460 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1461 	    filter, (void *)&found);
1462 
1463 	sx_assert(&allproc_lock, SA_LOCKED);
1464 	RACCT_LOCK();
1465 	FOREACH_PROC_IN_SYSTEM(p) {
1466 		found += rctl_racct_remove_rules(p->p_racct, filter);
1467 	}
1468 	RACCT_UNLOCK();
1469 
1470 	if (found)
1471 		return (0);
1472 	return (ESRCH);
1473 }
1474 
1475 /*
1476  * Appends a rule to the sbuf.
1477  */
1478 static void
1479 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1480 {
1481 	int64_t amount;
1482 
1483 	ASSERT_RACCT_ENABLED();
1484 
1485 	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1486 
1487 	switch (rule->rr_subject_type) {
1488 	case RCTL_SUBJECT_TYPE_PROCESS:
1489 		if (rule->rr_subject.rs_proc == NULL)
1490 			sbuf_printf(sb, ":");
1491 		else
1492 			sbuf_printf(sb, "%d:",
1493 			    rule->rr_subject.rs_proc->p_pid);
1494 		break;
1495 	case RCTL_SUBJECT_TYPE_USER:
1496 		if (rule->rr_subject.rs_uip == NULL)
1497 			sbuf_printf(sb, ":");
1498 		else
1499 			sbuf_printf(sb, "%d:",
1500 			    rule->rr_subject.rs_uip->ui_uid);
1501 		break;
1502 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1503 		if (rule->rr_subject.rs_loginclass == NULL)
1504 			sbuf_printf(sb, ":");
1505 		else
1506 			sbuf_printf(sb, "%s:",
1507 			    rule->rr_subject.rs_loginclass->lc_name);
1508 		break;
1509 	case RCTL_SUBJECT_TYPE_JAIL:
1510 		if (rule->rr_subject.rs_prison_racct == NULL)
1511 			sbuf_printf(sb, ":");
1512 		else
1513 			sbuf_printf(sb, "%s:",
1514 			    rule->rr_subject.rs_prison_racct->prr_name);
1515 		break;
1516 	default:
1517 		panic("rctl_rule_to_sbuf: unknown subject type %d",
1518 		    rule->rr_subject_type);
1519 	}
1520 
1521 	amount = rule->rr_amount;
1522 	if (amount != RCTL_AMOUNT_UNDEFINED &&
1523 	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1524 		amount /= 1000000;
1525 
1526 	sbuf_printf(sb, "%s:%s=%jd",
1527 	    rctl_resource_name(rule->rr_resource),
1528 	    rctl_action_name(rule->rr_action),
1529 	    amount);
1530 
1531 	if (rule->rr_per != rule->rr_subject_type)
1532 		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1533 }
1534 
1535 /*
1536  * Routine used by RCTL syscalls to read in input string.
1537  */
1538 static int
1539 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1540 {
1541 	char *str;
1542 	int error;
1543 
1544 	ASSERT_RACCT_ENABLED();
1545 
1546 	if (inbuflen <= 0)
1547 		return (EINVAL);
1548 	if (inbuflen > RCTL_MAX_INBUFSIZE)
1549 		return (E2BIG);
1550 
1551 	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1552 	error = copyinstr(inbufp, str, inbuflen, NULL);
1553 	if (error != 0) {
1554 		free(str, M_RCTL);
1555 		return (error);
1556 	}
1557 
1558 	*inputstr = str;
1559 
1560 	return (0);
1561 }
1562 
1563 /*
1564  * Routine used by RCTL syscalls to write out output string.
1565  */
1566 static int
1567 rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1568 {
1569 	int error;
1570 
1571 	ASSERT_RACCT_ENABLED();
1572 
1573 	if (outputsbuf == NULL)
1574 		return (0);
1575 
1576 	sbuf_finish(outputsbuf);
1577 	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1578 		sbuf_delete(outputsbuf);
1579 		return (ERANGE);
1580 	}
1581 	error = copyout(sbuf_data(outputsbuf), outbufp,
1582 	    sbuf_len(outputsbuf) + 1);
1583 	sbuf_delete(outputsbuf);
1584 	return (error);
1585 }
1586 
1587 static struct sbuf *
1588 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1589 {
1590 	struct sbuf *sb;
1591 	int64_t amount;
1592 	int i;
1593 
1594 	ASSERT_RACCT_ENABLED();
1595 
1596 	sb = sbuf_new_auto();
1597 	for (i = 0; i <= RACCT_MAX; i++) {
1598 		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1599 			continue;
1600 		RACCT_LOCK();
1601 		amount = racct->r_resources[i];
1602 		RACCT_UNLOCK();
1603 		if (RACCT_IS_IN_MILLIONS(i))
1604 			amount /= 1000000;
1605 		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1606 	}
1607 	sbuf_setpos(sb, sbuf_len(sb) - 1);
1608 	return (sb);
1609 }
1610 
1611 int
1612 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1613 {
1614 	struct rctl_rule *filter;
1615 	struct sbuf *outputsbuf = NULL;
1616 	struct proc *p;
1617 	struct uidinfo *uip;
1618 	struct loginclass *lc;
1619 	struct prison_racct *prr;
1620 	char *inputstr;
1621 	int error;
1622 
1623 	if (!racct_enable)
1624 		return (ENOSYS);
1625 
1626 	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1627 	if (error != 0)
1628 		return (error);
1629 
1630 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1631 	if (error != 0)
1632 		return (error);
1633 
1634 	sx_slock(&allproc_lock);
1635 	error = rctl_string_to_rule(inputstr, &filter);
1636 	free(inputstr, M_RCTL);
1637 	if (error != 0) {
1638 		sx_sunlock(&allproc_lock);
1639 		return (error);
1640 	}
1641 
1642 	switch (filter->rr_subject_type) {
1643 	case RCTL_SUBJECT_TYPE_PROCESS:
1644 		p = filter->rr_subject.rs_proc;
1645 		if (p == NULL) {
1646 			error = EINVAL;
1647 			goto out;
1648 		}
1649 		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1650 		break;
1651 	case RCTL_SUBJECT_TYPE_USER:
1652 		uip = filter->rr_subject.rs_uip;
1653 		if (uip == NULL) {
1654 			error = EINVAL;
1655 			goto out;
1656 		}
1657 		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1658 		break;
1659 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1660 		lc = filter->rr_subject.rs_loginclass;
1661 		if (lc == NULL) {
1662 			error = EINVAL;
1663 			goto out;
1664 		}
1665 		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1666 		break;
1667 	case RCTL_SUBJECT_TYPE_JAIL:
1668 		prr = filter->rr_subject.rs_prison_racct;
1669 		if (prr == NULL) {
1670 			error = EINVAL;
1671 			goto out;
1672 		}
1673 		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1674 		break;
1675 	default:
1676 		error = EINVAL;
1677 	}
1678 out:
1679 	rctl_rule_release(filter);
1680 	sx_sunlock(&allproc_lock);
1681 	if (error != 0)
1682 		return (error);
1683 
1684 	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1685 
1686 	return (error);
1687 }
1688 
1689 static void
1690 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1691 {
1692 	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1693 	struct rctl_rule_link *link;
1694 	struct sbuf *sb = (struct sbuf *)arg3;
1695 
1696 	ASSERT_RACCT_ENABLED();
1697 	RACCT_LOCK_ASSERT();
1698 
1699 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1700 		if (!rctl_rule_matches(link->rrl_rule, filter))
1701 			continue;
1702 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1703 		sbuf_printf(sb, ",");
1704 	}
1705 }
1706 
1707 int
1708 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1709 {
1710 	struct sbuf *sb;
1711 	struct rctl_rule *filter;
1712 	struct rctl_rule_link *link;
1713 	struct proc *p;
1714 	char *inputstr, *buf;
1715 	size_t bufsize;
1716 	int error;
1717 
1718 	if (!racct_enable)
1719 		return (ENOSYS);
1720 
1721 	error = priv_check(td, PRIV_RCTL_GET_RULES);
1722 	if (error != 0)
1723 		return (error);
1724 
1725 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1726 	if (error != 0)
1727 		return (error);
1728 
1729 	sx_slock(&allproc_lock);
1730 	error = rctl_string_to_rule(inputstr, &filter);
1731 	free(inputstr, M_RCTL);
1732 	if (error != 0) {
1733 		sx_sunlock(&allproc_lock);
1734 		return (error);
1735 	}
1736 
1737 	bufsize = uap->outbuflen;
1738 	if (bufsize > rctl_maxbufsize) {
1739 		sx_sunlock(&allproc_lock);
1740 		return (E2BIG);
1741 	}
1742 
1743 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1744 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1745 	KASSERT(sb != NULL, ("sbuf_new failed"));
1746 
1747 	FOREACH_PROC_IN_SYSTEM(p) {
1748 		RACCT_LOCK();
1749 		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1750 			/*
1751 			 * Non-process rules will be added to the buffer later.
1752 			 * Adding them here would result in duplicated output.
1753 			 */
1754 			if (link->rrl_rule->rr_subject_type !=
1755 			    RCTL_SUBJECT_TYPE_PROCESS)
1756 				continue;
1757 			if (!rctl_rule_matches(link->rrl_rule, filter))
1758 				continue;
1759 			rctl_rule_to_sbuf(sb, link->rrl_rule);
1760 			sbuf_printf(sb, ",");
1761 		}
1762 		RACCT_UNLOCK();
1763 	}
1764 
1765 	loginclass_racct_foreach(rctl_get_rules_callback,
1766 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1767 	    filter, sb);
1768 	ui_racct_foreach(rctl_get_rules_callback,
1769 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1770 	    filter, sb);
1771 	prison_racct_foreach(rctl_get_rules_callback,
1772 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1773 	    filter, sb);
1774 	if (sbuf_error(sb) == ENOMEM) {
1775 		error = ERANGE;
1776 		goto out;
1777 	}
1778 
1779 	/*
1780 	 * Remove trailing ",".
1781 	 */
1782 	if (sbuf_len(sb) > 0)
1783 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1784 
1785 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1786 out:
1787 	rctl_rule_release(filter);
1788 	sx_sunlock(&allproc_lock);
1789 	free(buf, M_RCTL);
1790 	return (error);
1791 }
1792 
1793 int
1794 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1795 {
1796 	struct sbuf *sb;
1797 	struct rctl_rule *filter;
1798 	struct rctl_rule_link *link;
1799 	char *inputstr, *buf;
1800 	size_t bufsize;
1801 	int error;
1802 
1803 	if (!racct_enable)
1804 		return (ENOSYS);
1805 
1806 	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1807 	if (error != 0)
1808 		return (error);
1809 
1810 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1811 	if (error != 0)
1812 		return (error);
1813 
1814 	sx_slock(&allproc_lock);
1815 	error = rctl_string_to_rule(inputstr, &filter);
1816 	free(inputstr, M_RCTL);
1817 	if (error != 0) {
1818 		sx_sunlock(&allproc_lock);
1819 		return (error);
1820 	}
1821 
1822 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1823 		rctl_rule_release(filter);
1824 		sx_sunlock(&allproc_lock);
1825 		return (EINVAL);
1826 	}
1827 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1828 		rctl_rule_release(filter);
1829 		sx_sunlock(&allproc_lock);
1830 		return (EOPNOTSUPP);
1831 	}
1832 	if (filter->rr_subject.rs_proc == NULL) {
1833 		rctl_rule_release(filter);
1834 		sx_sunlock(&allproc_lock);
1835 		return (EINVAL);
1836 	}
1837 
1838 	bufsize = uap->outbuflen;
1839 	if (bufsize > rctl_maxbufsize) {
1840 		rctl_rule_release(filter);
1841 		sx_sunlock(&allproc_lock);
1842 		return (E2BIG);
1843 	}
1844 
1845 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1846 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1847 	KASSERT(sb != NULL, ("sbuf_new failed"));
1848 
1849 	RACCT_LOCK();
1850 	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1851 	    rrl_next) {
1852 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1853 		sbuf_printf(sb, ",");
1854 	}
1855 	RACCT_UNLOCK();
1856 	if (sbuf_error(sb) == ENOMEM) {
1857 		error = ERANGE;
1858 		sbuf_delete(sb);
1859 		goto out;
1860 	}
1861 
1862 	/*
1863 	 * Remove trailing ",".
1864 	 */
1865 	if (sbuf_len(sb) > 0)
1866 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1867 
1868 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1869 out:
1870 	rctl_rule_release(filter);
1871 	sx_sunlock(&allproc_lock);
1872 	free(buf, M_RCTL);
1873 	return (error);
1874 }
1875 
1876 int
1877 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1878 {
1879 	struct rctl_rule *rule;
1880 	char *inputstr;
1881 	int error;
1882 
1883 	if (!racct_enable)
1884 		return (ENOSYS);
1885 
1886 	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1887 	if (error != 0)
1888 		return (error);
1889 
1890 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1891 	if (error != 0)
1892 		return (error);
1893 
1894 	sx_slock(&allproc_lock);
1895 	error = rctl_string_to_rule(inputstr, &rule);
1896 	free(inputstr, M_RCTL);
1897 	if (error != 0) {
1898 		sx_sunlock(&allproc_lock);
1899 		return (error);
1900 	}
1901 	/*
1902 	 * The 'per' part of a rule is optional.
1903 	 */
1904 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1905 	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1906 		rule->rr_per = rule->rr_subject_type;
1907 
1908 	if (!rctl_rule_fully_specified(rule)) {
1909 		error = EINVAL;
1910 		goto out;
1911 	}
1912 
1913 	error = rctl_rule_add(rule);
1914 
1915 out:
1916 	rctl_rule_release(rule);
1917 	sx_sunlock(&allproc_lock);
1918 	return (error);
1919 }
1920 
1921 int
1922 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1923 {
1924 	struct rctl_rule *filter;
1925 	char *inputstr;
1926 	int error;
1927 
1928 	if (!racct_enable)
1929 		return (ENOSYS);
1930 
1931 	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1932 	if (error != 0)
1933 		return (error);
1934 
1935 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1936 	if (error != 0)
1937 		return (error);
1938 
1939 	sx_slock(&allproc_lock);
1940 	error = rctl_string_to_rule(inputstr, &filter);
1941 	free(inputstr, M_RCTL);
1942 	if (error != 0) {
1943 		sx_sunlock(&allproc_lock);
1944 		return (error);
1945 	}
1946 
1947 	error = rctl_rule_remove(filter);
1948 	rctl_rule_release(filter);
1949 	sx_sunlock(&allproc_lock);
1950 
1951 	return (error);
1952 }
1953 
/*
 * Update RCTL rule list after credential change.
 *
 * Rebuilds p's rule list so that it holds the process-subject rules
 * already linked to p plus every rule linked to the racct of the new
 * credential's real-uid uidinfo, login class and prison.  Does nothing
 * when racct is disabled.  The process lock must not be held by the
 * caller.
 *
 * The replacement links are allocated with M_WAITOK while the RACCT
 * lock is dropped, so the function counts, allocates, and then checks
 * under the lock that the count still fits; on a mismatch it frees the
 * temporary list and starts over.
 *
 * NOTE(review): only cr_ruidinfo and the immediate cr_prison are
 * consulted here, whereas rctl_rule_add() also matches cr_uidinfo and
 * walks pr_parent -- confirm that the difference is intended.
 */
void
rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
{
	LIST_HEAD(, rctl_rule_link) newrules;
	struct rctl_rule_link *link, *newlink;
	struct uidinfo *newuip;
	struct loginclass *newlc;
	struct prison_racct *newprr;
	int rulecnt, i;

	if (!racct_enable)
		return;

	PROC_LOCK_ASSERT(p, MA_NOTOWNED);

	newuip = newcred->cr_ruidinfo;
	newlc = newcred->cr_loginclass;
	newprr = newcred->cr_prison->pr_prison_racct;

	LIST_INIT(&newrules);

again:
	/*
	 * First, count the rules that apply to the process with new
	 * credentials.
	 */
	rulecnt = 0;
	RACCT_LOCK();
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		/* Of p's current rules, only process-subject ones carry over. */
		if (link->rrl_rule->rr_subject_type ==
		    RCTL_SUBJECT_TYPE_PROCESS)
			rulecnt++;
	}
	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
		rulecnt++;
	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
		rulecnt++;
	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
		rulecnt++;
	RACCT_UNLOCK();

	/*
	 * Create temporary list.  The RACCT lock has been dropped in
	 * order to use M_WAITOK.
	 */
	for (i = 0; i < rulecnt; i++) {
		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
		newlink->rrl_rule = NULL;
		newlink->rrl_exceeded = 0;
		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
	}

	newlink = LIST_FIRST(&newrules);

	/*
	 * Assign rules to the newly allocated list entries.  Each
	 * assignment takes a rule reference and decrements rulecnt;
	 * running out of entries (newlink == NULL) means more rules
	 * exist now than were counted above.
	 */
	RACCT_LOCK();
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		if (link->rrl_rule->rr_subject_type ==
		    RCTL_SUBJECT_TYPE_PROCESS) {
			if (newlink == NULL)
				goto goaround;
			rctl_rule_acquire(link->rrl_rule);
			newlink->rrl_rule = link->rrl_rule;
			newlink->rrl_exceeded = link->rrl_exceeded;
			newlink = LIST_NEXT(newlink, rrl_next);
			rulecnt--;
		}
	}

	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	/* rulecnt == 0: every preallocated entry was filled exactly once. */
	if (rulecnt == 0) {
		/*
		 * Free the old rule list.
		 */
		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
			link = LIST_FIRST(&p->p_racct->r_rule_links);
			LIST_REMOVE(link, rrl_next);
			rctl_rule_release(link->rrl_rule);
			uma_zfree(rctl_rule_link_zone, link);
		}

		/*
		 * Replace lists and we're done.
		 *
		 * XXX: Is there any way to switch list heads instead
		 *      of iterating here?
		 */
		while (!LIST_EMPTY(&newrules)) {
			newlink = LIST_FIRST(&newrules);
			LIST_REMOVE(newlink, rrl_next);
			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
			    newlink, rrl_next);
		}

		RACCT_UNLOCK();

		return;
	}

goaround:
	RACCT_UNLOCK();

	/*
	 * Rule list changed while we were not holding the RACCT lock.
	 * Free the new list, dropping the references taken on any
	 * entries that were already filled in, and try again.
	 */
	while (!LIST_EMPTY(&newrules)) {
		newlink = LIST_FIRST(&newrules);
		LIST_REMOVE(newlink, rrl_next);
		if (newlink->rrl_rule != NULL)
			rctl_rule_release(newlink->rrl_rule);
		uma_zfree(rctl_rule_link_zone, newlink);
	}

	goto again;
}
2104 
2105 /*
2106  * Assign RCTL rules to the newly created process.
2107  */
2108 int
2109 rctl_proc_fork(struct proc *parent, struct proc *child)
2110 {
2111 	struct rctl_rule *rule;
2112 	struct rctl_rule_link *link;
2113 	int error;
2114 
2115 	ASSERT_RACCT_ENABLED();
2116 	RACCT_LOCK_ASSERT();
2117 	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2118 
2119 	LIST_INIT(&child->p_racct->r_rule_links);
2120 
2121 	/*
2122 	 * Go through limits applicable to the parent and assign them
2123 	 * to the child.  Rules with 'process' subject have to be duplicated
2124 	 * in order to make their rr_subject point to the new process.
2125 	 */
2126 	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2127 		if (link->rrl_rule->rr_subject_type ==
2128 		    RCTL_SUBJECT_TYPE_PROCESS) {
2129 			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2130 			if (rule == NULL)
2131 				goto fail;
2132 			KASSERT(rule->rr_subject.rs_proc == parent,
2133 			    ("rule->rr_subject.rs_proc != parent"));
2134 			rule->rr_subject.rs_proc = child;
2135 			error = rctl_racct_add_rule_locked(child->p_racct,
2136 			    rule);
2137 			rctl_rule_release(rule);
2138 			if (error != 0)
2139 				goto fail;
2140 		} else {
2141 			error = rctl_racct_add_rule_locked(child->p_racct,
2142 			    link->rrl_rule);
2143 			if (error != 0)
2144 				goto fail;
2145 		}
2146 	}
2147 
2148 	return (0);
2149 
2150 fail:
2151 	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2152 		link = LIST_FIRST(&child->p_racct->r_rule_links);
2153 		LIST_REMOVE(link, rrl_next);
2154 		rctl_rule_release(link->rrl_rule);
2155 		uma_zfree(rctl_rule_link_zone, link);
2156 	}
2157 
2158 	return (EAGAIN);
2159 }
2160 
2161 /*
2162  * Release rules attached to the racct.
2163  */
2164 void
2165 rctl_racct_release(struct racct *racct)
2166 {
2167 	struct rctl_rule_link *link;
2168 
2169 	ASSERT_RACCT_ENABLED();
2170 	RACCT_LOCK_ASSERT();
2171 
2172 	while (!LIST_EMPTY(&racct->r_rule_links)) {
2173 		link = LIST_FIRST(&racct->r_rule_links);
2174 		LIST_REMOVE(link, rrl_next);
2175 		rctl_rule_release(link->rrl_rule);
2176 		uma_zfree(rctl_rule_link_zone, link);
2177 	}
2178 }
2179 
2180 static void
2181 rctl_init(void)
2182 {
2183 
2184 	if (!racct_enable)
2185 		return;
2186 
2187 	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2188 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2189 	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2190 	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2191 	    UMA_ALIGN_PTR, 0);
2192 
2193 	/*
2194 	 * Set default values, making sure not to overwrite the ones
2195 	 * fetched from tunables.  Most of those could be set at the
2196 	 * declaration, except for the rctl_throttle_max - we cannot
2197 	 * set it there due to hz not being compile time constant.
2198 	 */
2199 	if (rctl_throttle_min < 1)
2200 		rctl_throttle_min = 1;
2201 	if (rctl_throttle_max < rctl_throttle_min)
2202 		rctl_throttle_max = 2 * hz;
2203 	if (rctl_throttle_pct < 0)
2204 		rctl_throttle_pct = 100;
2205 	if (rctl_throttle_pct2 < 0)
2206 		rctl_throttle_pct2 = 100;
2207 }
2208 
2209 #else /* !RCTL */
2210 
/*
 * Stub for the !RCTL build: the system call is not implemented and
 * always returns ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

	return (ENOSYS);
}
2217 
/*
 * Stub for the !RCTL build: the system call is not implemented and
 * always returns ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

	return (ENOSYS);
}
2224 
/*
 * Stub for the !RCTL build: the system call is not implemented and
 * always returns ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

	return (ENOSYS);
}
2231 
/*
 * Stub for the !RCTL build: the system call is not implemented and
 * always returns ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

	return (ENOSYS);
}
2238 
/*
 * Stub for the !RCTL build: the system call is not implemented and
 * always returns ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

	return (ENOSYS);
}
2245 
2246 #endif /* !RCTL */
2247