xref: /freebsd/sys/kern/kern_rctl.c (revision d9f0ce31900a48d1a2bfc1c8c86f79d1e831451a)
1 /*-
2  * Copyright (c) 2010 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <sys/param.h>
36 #include <sys/bus.h>
37 #include <sys/malloc.h>
38 #include <sys/queue.h>
39 #include <sys/refcount.h>
40 #include <sys/jail.h>
41 #include <sys/kernel.h>
42 #include <sys/limits.h>
43 #include <sys/loginclass.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/racct.h>
47 #include <sys/rctl.h>
48 #include <sys/resourcevar.h>
49 #include <sys/sx.h>
50 #include <sys/sysent.h>
51 #include <sys/sysproto.h>
52 #include <sys/systm.h>
53 #include <sys/types.h>
54 #include <sys/eventhandler.h>
55 #include <sys/lock.h>
56 #include <sys/mutex.h>
57 #include <sys/rwlock.h>
58 #include <sys/sbuf.h>
59 #include <sys/taskqueue.h>
60 #include <sys/tree.h>
61 #include <vm/uma.h>
62 
63 #ifdef RCTL
64 #ifndef RACCT
65 #error "The RCTL option requires the RACCT option"
66 #endif
67 
68 FEATURE(rctl, "Resource Limits");
69 
/*
 * HRF_* flag values.
 * NOTE(review): nothing in the visible part of this file references
 * them; confirm they are still needed.
 */
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer size limits.  The expansions are parenthesized so that each macro
 * behaves as a single value when it appears inside a larger expression
 * (e.g. "x / RCTL_MAX_INBUFSIZE" or "x % RCTL_MAX_OUTBUFSIZE").
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/*
 * Largest margin that rctl_pcpu_available() subtracts from the available
 * %cpu amount to make throttling kick in early.
 */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
79 
/*
 * Run-time tunables, exported below under the kern.racct.rctl sysctl node.
 *
 * rctl_maxbufsize starts out at RCTL_MAX_OUTBUFSIZE.  The two rate limits
 * are handed to ppsratecheck() in rctl_enforce() to bound how many "log"
 * and "devctl" notifications are emitted per second.
 *
 * NOTE(review): the rate limits are declared int but exported with
 * SYSCTL_UINT; likewise the throttle_* handlers use sysctl_handle_int()
 * while being declared CTLTYPE_UINT/"IU".  Confirm this is intentional.
 */
static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;

/*
 * Values below are initialized in rctl_init().
 */
static int rctl_throttle_min = -1;
static int rctl_throttle_max = -1;
static int rctl_throttle_pct = -1;
static int rctl_throttle_pct2 = -1;

/*
 * Handlers validating writes to the throttle_* sysctls; defined further
 * down in this file.
 */
static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
    &rctl_maxbufsize, 0, "Maximum output buffer size");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
/*
 * Each throttle knob is a SYSCTL_PROC (so that writes go through the
 * validating handler) paired with a loader tunable of the same name.
 */
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU",
    "Shortest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU",
    "Longest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU",
    "Throttling penalty for process consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU",
    "Throttling penalty for container consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
120 
/*
 * 'rctl_rule_link' connects a rule with every racct it's related to.
 * For example, rule 'user:X:openfiles:deny=N/process' is linked
 * with uidinfo for user X, and to each process of that user.
 *
 * Links live on a racct's r_rule_links list and each one holds a
 * reference on the rule it points to.
 */
struct rctl_rule_link {
	/* Linkage on the owning racct's r_rule_links list. */
	LIST_ENTRY(rctl_rule_link)	rrl_next;
	/* The rule this link stands for. */
	struct rctl_rule		*rrl_rule;
	/*
	 * Set to 1 by rctl_enforce() once a log, devctl or signal action
	 * has fired for this link, and reset to 0 when the request fits
	 * within the limit again, so those actions are not repeated.
	 */
	int				rrl_exceeded;
};
131 
/*
 * One entry of a name <-> integer lookup table.  Tables built from this
 * type end with an entry whose d_name is NULL.
 */
struct dict {
	/* Textual name; NULL marks the end of the table. */
	const char	*d_name;
	/* Integer constant the name stands for. */
	int		d_value;
};
136 
137 static struct dict subjectnames[] = {
138 	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
139 	{ "user", RCTL_SUBJECT_TYPE_USER },
140 	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
141 	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
142 	{ NULL, -1 }};
143 
144 static struct dict resourcenames[] = {
145 	{ "cputime", RACCT_CPU },
146 	{ "datasize", RACCT_DATA },
147 	{ "stacksize", RACCT_STACK },
148 	{ "coredumpsize", RACCT_CORE },
149 	{ "memoryuse", RACCT_RSS },
150 	{ "memorylocked", RACCT_MEMLOCK },
151 	{ "maxproc", RACCT_NPROC },
152 	{ "openfiles", RACCT_NOFILE },
153 	{ "vmemoryuse", RACCT_VMEM },
154 	{ "pseudoterminals", RACCT_NPTS },
155 	{ "swapuse", RACCT_SWAP },
156 	{ "nthr", RACCT_NTHR },
157 	{ "msgqqueued", RACCT_MSGQQUEUED },
158 	{ "msgqsize", RACCT_MSGQSIZE },
159 	{ "nmsgq", RACCT_NMSGQ },
160 	{ "nsem", RACCT_NSEM },
161 	{ "nsemop", RACCT_NSEMOP },
162 	{ "nshm", RACCT_NSHM },
163 	{ "shmsize", RACCT_SHMSIZE },
164 	{ "wallclock", RACCT_WALLCLOCK },
165 	{ "pcpu", RACCT_PCTCPU },
166 	{ "readbps", RACCT_READBPS },
167 	{ "writebps", RACCT_WRITEBPS },
168 	{ "readiops", RACCT_READIOPS },
169 	{ "writeiops", RACCT_WRITEIOPS },
170 	{ NULL, -1 }};
171 
172 static struct dict actionnames[] = {
173 	{ "sighup", RCTL_ACTION_SIGHUP },
174 	{ "sigint", RCTL_ACTION_SIGINT },
175 	{ "sigquit", RCTL_ACTION_SIGQUIT },
176 	{ "sigill", RCTL_ACTION_SIGILL },
177 	{ "sigtrap", RCTL_ACTION_SIGTRAP },
178 	{ "sigabrt", RCTL_ACTION_SIGABRT },
179 	{ "sigemt", RCTL_ACTION_SIGEMT },
180 	{ "sigfpe", RCTL_ACTION_SIGFPE },
181 	{ "sigkill", RCTL_ACTION_SIGKILL },
182 	{ "sigbus", RCTL_ACTION_SIGBUS },
183 	{ "sigsegv", RCTL_ACTION_SIGSEGV },
184 	{ "sigsys", RCTL_ACTION_SIGSYS },
185 	{ "sigpipe", RCTL_ACTION_SIGPIPE },
186 	{ "sigalrm", RCTL_ACTION_SIGALRM },
187 	{ "sigterm", RCTL_ACTION_SIGTERM },
188 	{ "sigurg", RCTL_ACTION_SIGURG },
189 	{ "sigstop", RCTL_ACTION_SIGSTOP },
190 	{ "sigtstp", RCTL_ACTION_SIGTSTP },
191 	{ "sigchld", RCTL_ACTION_SIGCHLD },
192 	{ "sigttin", RCTL_ACTION_SIGTTIN },
193 	{ "sigttou", RCTL_ACTION_SIGTTOU },
194 	{ "sigio", RCTL_ACTION_SIGIO },
195 	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
196 	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
197 	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
198 	{ "sigprof", RCTL_ACTION_SIGPROF },
199 	{ "sigwinch", RCTL_ACTION_SIGWINCH },
200 	{ "siginfo", RCTL_ACTION_SIGINFO },
201 	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
202 	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
203 	{ "sigthr", RCTL_ACTION_SIGTHR },
204 	{ "deny", RCTL_ACTION_DENY },
205 	{ "log", RCTL_ACTION_LOG },
206 	{ "devctl", RCTL_ACTION_DEVCTL },
207 	{ "throttle", RCTL_ACTION_THROTTLE },
208 	{ NULL, -1 }};
209 
/* Subsystem initialization hook, registered to run at SI_SUB_RACCT. */
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);

/* UMA zones backing 'struct rctl_rule' and 'struct rctl_rule_link'. */
static uma_zone_t rctl_rule_zone;
static uma_zone_t rctl_rule_link_zone;
/*
 * Reader/writer lock taken around every walk or modification of a racct's
 * r_rule_links list in this file, and around updates of the throttle
 * tunables.
 */
static struct rwlock rctl_lock;
RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock");

/* Shorthands for taking, dropping and asserting rctl_lock. */
#define RCTL_RLOCK()		rw_rlock(&rctl_lock)
#define RCTL_RUNLOCK()		rw_runlock(&rctl_lock)
#define RCTL_WLOCK()		rw_wlock(&rctl_lock)
#define RCTL_WUNLOCK()		rw_wunlock(&rctl_lock)
#define RCTL_LOCK_ASSERT()	rw_assert(&rctl_lock, RA_LOCKED)
#define RCTL_WLOCK_ASSERT()	rw_assert(&rctl_lock, RA_WLOCKED)

/* Forward declarations for helpers used before their definitions. */
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);

/* malloc(9) type used for the temporary message buffers in rctl_enforce(). */
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
229 
230 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
231 {
232 	int error, val = rctl_throttle_min;
233 
234 	error = sysctl_handle_int(oidp, &val, 0, req);
235 	if (error || !req->newptr)
236 		return (error);
237 	if (val < 1 || val > rctl_throttle_max)
238 		return (EINVAL);
239 
240 	RCTL_WLOCK();
241 	rctl_throttle_min = val;
242 	RCTL_WUNLOCK();
243 
244 	return (0);
245 }
246 
247 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
248 {
249 	int error, val = rctl_throttle_max;
250 
251 	error = sysctl_handle_int(oidp, &val, 0, req);
252 	if (error || !req->newptr)
253 		return (error);
254 	if (val < rctl_throttle_min)
255 		return (EINVAL);
256 
257 	RCTL_WLOCK();
258 	rctl_throttle_max = val;
259 	RCTL_WUNLOCK();
260 
261 	return (0);
262 }
263 
264 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
265 {
266 	int error, val = rctl_throttle_pct;
267 
268 	error = sysctl_handle_int(oidp, &val, 0, req);
269 	if (error || !req->newptr)
270 		return (error);
271 	if (val < 0)
272 		return (EINVAL);
273 
274 	RCTL_WLOCK();
275 	rctl_throttle_pct = val;
276 	RCTL_WUNLOCK();
277 
278 	return (0);
279 }
280 
281 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
282 {
283 	int error, val = rctl_throttle_pct2;
284 
285 	error = sysctl_handle_int(oidp, &val, 0, req);
286 	if (error || !req->newptr)
287 		return (error);
288 	if (val < 0)
289 		return (EINVAL);
290 
291 	RCTL_WLOCK();
292 	rctl_throttle_pct2 = val;
293 	RCTL_WUNLOCK();
294 
295 	return (0);
296 }
297 
298 static const char *
299 rctl_subject_type_name(int subject)
300 {
301 	int i;
302 
303 	for (i = 0; subjectnames[i].d_name != NULL; i++) {
304 		if (subjectnames[i].d_value == subject)
305 			return (subjectnames[i].d_name);
306 	}
307 
308 	panic("rctl_subject_type_name: unknown subject type %d", subject);
309 }
310 
311 static const char *
312 rctl_action_name(int action)
313 {
314 	int i;
315 
316 	for (i = 0; actionnames[i].d_name != NULL; i++) {
317 		if (actionnames[i].d_value == action)
318 			return (actionnames[i].d_name);
319 	}
320 
321 	panic("rctl_action_name: unknown action %d", action);
322 }
323 
324 const char *
325 rctl_resource_name(int resource)
326 {
327 	int i;
328 
329 	for (i = 0; resourcenames[i].d_name != NULL; i++) {
330 		if (resourcenames[i].d_value == resource)
331 			return (resourcenames[i].d_name);
332 	}
333 
334 	panic("rctl_resource_name: unknown resource %d", resource);
335 }
336 
337 static struct racct *
338 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
339 {
340 	struct ucred *cred = p->p_ucred;
341 
342 	ASSERT_RACCT_ENABLED();
343 	RCTL_LOCK_ASSERT();
344 
345 	switch (rule->rr_per) {
346 	case RCTL_SUBJECT_TYPE_PROCESS:
347 		return (p->p_racct);
348 	case RCTL_SUBJECT_TYPE_USER:
349 		return (cred->cr_ruidinfo->ui_racct);
350 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
351 		return (cred->cr_loginclass->lc_racct);
352 	case RCTL_SUBJECT_TYPE_JAIL:
353 		return (cred->cr_prison->pr_prison_racct->prr_racct);
354 	default:
355 		panic("%s: unknown per %d", __func__, rule->rr_per);
356 	}
357 }
358 
359 /*
360  * Return the amount of resource that can be allocated by 'p' before
361  * hitting 'rule'.
362  */
363 static int64_t
364 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
365 {
366 	const struct racct *racct;
367 	int64_t available;
368 
369 	ASSERT_RACCT_ENABLED();
370 	RCTL_LOCK_ASSERT();
371 
372 	racct = rctl_proc_rule_to_racct(p, rule);
373 	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
374 
375 	return (available);
376 }
377 
378 /*
379  * Called every second for proc, uidinfo, loginclass, and jail containers.
380  * If the limit isn't exceeded, it decreases the usage amount to zero.
381  * Otherwise, it decreases it by the value of the limit.  This way
382  * resource consumption exceeding the limit "carries over" to the next
383  * period.
384  */
385 void
386 rctl_throttle_decay(struct racct *racct, int resource)
387 {
388 	struct rctl_rule *rule;
389 	struct rctl_rule_link *link;
390 	int64_t minavailable;
391 
392 	ASSERT_RACCT_ENABLED();
393 
394 	minavailable = INT64_MAX;
395 
396 	RCTL_RLOCK();
397 
398 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
399 		rule = link->rrl_rule;
400 
401 		if (rule->rr_resource != resource)
402 			continue;
403 		if (rule->rr_action != RCTL_ACTION_THROTTLE)
404 			continue;
405 
406 		if (rule->rr_amount < minavailable)
407 			minavailable = rule->rr_amount;
408 	}
409 
410 	RCTL_RUNLOCK();
411 
412 	if (racct->r_resources[resource] < minavailable) {
413 		racct->r_resources[resource] = 0;
414 	} else {
415 		/*
416 		 * Cap utilization counter at ten times the limit.  Otherwise,
417 		 * if we changed the rule lowering the allowed amount, it could
418 		 * take unreasonably long time for the accumulated resource
419 		 * usage to drop.
420 		 */
421 		if (racct->r_resources[resource] > minavailable * 10)
422 			racct->r_resources[resource] = minavailable * 10;
423 
424 		racct->r_resources[resource] -= minavailable;
425 	}
426 }
427 
428 /*
429  * Special version of rctl_get_available() for the %CPU resource.
430  * We slightly cheat here and return less than we normally would.
431  */
432 int64_t
433 rctl_pcpu_available(const struct proc *p) {
434 	struct rctl_rule *rule;
435 	struct rctl_rule_link *link;
436 	int64_t available, minavailable, limit;
437 
438 	ASSERT_RACCT_ENABLED();
439 
440 	minavailable = INT64_MAX;
441 	limit = 0;
442 
443 	RCTL_RLOCK();
444 
445 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
446 		rule = link->rrl_rule;
447 		if (rule->rr_resource != RACCT_PCTCPU)
448 			continue;
449 		if (rule->rr_action != RCTL_ACTION_DENY)
450 			continue;
451 		available = rctl_available_resource(p, rule);
452 		if (available < minavailable) {
453 			minavailable = available;
454 			limit = rule->rr_amount;
455 		}
456 	}
457 
458 	RCTL_RUNLOCK();
459 
460 	/*
461 	 * Return slightly less than actual value of the available
462 	 * %cpu resource.  This makes %cpu throttling more agressive
463 	 * and lets us act sooner than the limits are already exceeded.
464 	 */
465 	if (limit != 0) {
466 		if (limit > 2 * RCTL_PCPU_SHIFT)
467 			minavailable -= RCTL_PCPU_SHIFT;
468 		else
469 			minavailable -= (limit / 2);
470 	}
471 
472 	return (minavailable);
473 }
474 
/*
 * Saturating 64-bit unsigned addition: returns a + b, or UINT64_MAX if
 * the sum does not fit.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

	/* The sum wraps exactly when b exceeds the headroom left above a. */
	if (b > UINT64_MAX - a)
		return (UINT64_MAX);

	return (a + b);
}
490 
/*
 * Saturating 64-bit unsigned multiplication: returns a * b, or UINT64_MAX
 * if the product does not fit.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	/* Multiplying by zero can never overflow (and must not divide). */
	if (b == 0)
		return (0);

	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
500 
/*
 * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
 * to what it keeps allocated now.  Returns non-zero if the allocation should
 * be denied, 0 otherwise.
 *
 * Every rule linked to the process racct that concerns 'resource' and
 * would be exceeded by the request has its action carried out here:
 * logging, devctl notification, throttling or signal delivery.  A "deny"
 * action only sets a flag; the non-zero return value (EDOOFUS) is produced
 * after the whole list has been processed.
 *
 * The rule list is walked with the RCTL lock held for reading, which is
 * why the temporary buffers are allocated with M_NOWAIT.
 */
int
rctl_enforce(struct proc *p, int resource, uint64_t amount)
{
	/*
	 * Rate limiter state for ppsratecheck(); being static, it is shared
	 * by all callers rather than kept per process or per rule.
	 */
	static struct timeval log_lasttime, devctl_lasttime;
	static int log_curtime = 0, devctl_curtime = 0;
	struct rctl_rule *rule;
	struct rctl_rule_link *link;
	struct sbuf sb;
	char *buf;
	int64_t available;
	uint64_t sleep_ms, sleep_ratio;
	int should_deny = 0;


	ASSERT_RACCT_ENABLED();

	RCTL_RLOCK();

	/*
	 * There may be more than one matching rule; go through all of them.
	 * Denial should be done last, after logging and sending signals.
	 */
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		rule = link->rrl_rule;
		if (rule->rr_resource != resource)
			continue;

		/*
		 * The request fits under this rule: re-arm the one-shot
		 * actions (log, devctl, signal) and move on.
		 */
		available = rctl_available_resource(p, rule);
		if (available >= (int64_t)amount) {
			link->rrl_exceeded = 0;
			continue;
		}

		switch (rule->rr_action) {
		case RCTL_ACTION_DENY:
			should_deny = 1;
			continue;
		case RCTL_ACTION_LOG:
			/*
			 * If rrl_exceeded != 0, it means we've already
			 * logged a warning for this process.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			/*
			 * If the process state is not fully initialized yet,
			 * we can't access most of the required fields, e.g.
			 * p->p_comm.  This happens when called from fork1().
			 * Ignore this rule for now; it will be processed just
			 * after fork, when called from racct_proc_fork_done().
			 */
			if (p->p_state != PRS_NORMAL)
				continue;

			if (!ppsratecheck(&log_lasttime, &log_curtime,
			    rctl_log_rate_limit))
				continue;

			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
			if (buf == NULL) {
				printf("rctl_enforce: out of memory\n");
				continue;
			}
			/* Render the rule as text into the fixed-size buffer. */
			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
			rctl_rule_to_sbuf(&sb, rule);
			sbuf_finish(&sb);
			printf("rctl: rule \"%s\" matched by pid %d "
			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
			sbuf_delete(&sb);
			free(buf, M_RCTL);
			link->rrl_exceeded = 1;
			continue;
		case RCTL_ACTION_DEVCTL:
			/*
			 * Same gating as for RCTL_ACTION_LOG: once per
			 * exceeded limit, only for fully initialized
			 * processes, and rate limited.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			if (p->p_state != PRS_NORMAL)
				continue;

			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
			    rctl_devctl_rate_limit))
				continue;

			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
			if (buf == NULL) {
				printf("rctl_enforce: out of memory\n");
				continue;
			}
			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
			sbuf_printf(&sb, "rule=");
			rctl_rule_to_sbuf(&sb, rule);
			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
			    p->p_pid, p->p_ucred->cr_ruid,
			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
			sbuf_finish(&sb);
			devctl_notify_f("RCTL", "rule", "matched",
			    sbuf_data(&sb), M_NOWAIT);
			sbuf_delete(&sb);
			free(buf, M_RCTL);
			link->rrl_exceeded = 1;
			continue;
		case RCTL_ACTION_THROTTLE:
			if (p->p_state != PRS_NORMAL)
				continue;

			/*
			 * Make the process sleep for a fraction of second
			 * proportional to the ratio of process' resource
			 * utilization compared to the limit.  The point is
			 * to penalize resource hogs: processes that consume
			 * more of the available resources sleep for longer.
			 *
			 * We're trying to defer division until the very end,
			 * to minimize the rounding effects.  The following
			 * calculation could have been written in a clearer
			 * way like this:
			 *
			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
			 *     rule->rr_amount;
			 * sleep_ms *= rctl_throttle_pct / 100;
			 * if (sleep_ms < rctl_throttle_min)
			 *         sleep_ms = rctl_throttle_min;
			 *
			 * NOTE(review): despite its name, sleep_ms is scaled
			 * by hz and clamped with rctl_throttle_min/max, whose
			 * sysctl descriptions say "in hz"; presumably it is a
			 * tick count rather than milliseconds - confirm
			 * against racct_proc_throttle().
			 *
			 * NOTE(review): rule->rr_amount is used as a divisor
			 * below; this assumes a throttle rule can never have
			 * an amount of 0 - confirm where rules are validated.
			 */
			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
				sleep_ms = rctl_throttle_min * rule->rr_amount;

			/*
			 * Multiply that by the ratio of the resource
			 * consumption for the container compared to the limit,
			 * squared.  In other words, a process in a container
			 * that is two times over the limit will be throttled
			 * four times as much for hitting the same rule.  The
			 * point is to penalize processes more if the container
			 * itself (eg certain UID or jail) is above the limit.
			 */
			if (available < 0)
				sleep_ratio = -available / rule->rr_amount;
			else
				sleep_ratio = 0;
			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));

			/*
			 * Finally the division.
			 */
			sleep_ms /= rule->rr_amount;

			if (sleep_ms > rctl_throttle_max)
				sleep_ms = rctl_throttle_max;
#if 0
			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n",
			   __func__, p->p_pid, p->p_comm,
			   p->p_racct->r_resources[resource],
			   rule->rr_amount, sleep_ms, sleep_ratio, available);
#endif

			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
			racct_proc_throttle(p, sleep_ms);
			continue;
		default:
			/*
			 * Everything else is expected to be a signal action;
			 * the signal is delivered once per exceeded limit.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			if (p->p_state != PRS_NORMAL)
				continue;

			KASSERT(rule->rr_action > 0 &&
			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
			    ("rctl_enforce: unknown action %d",
			     rule->rr_action));

			/*
			 * We're using the fact that RCTL_ACTION_SIG* values
			 * are equal to their counterparts from sys/signal.h.
			 */
			kern_psignal(p, rule->rr_action);
			link->rrl_exceeded = 1;
			continue;
		}
	}

	RCTL_RUNLOCK();

	if (should_deny) {
		/*
		 * Return fake error code; the caller should change it
		 * into one proper for the situation - EFSIZ, ENOMEM etc.
		 */
		return (EDOOFUS);
	}

	return (0);
}
707 
708 uint64_t
709 rctl_get_limit(struct proc *p, int resource)
710 {
711 	struct rctl_rule *rule;
712 	struct rctl_rule_link *link;
713 	uint64_t amount = UINT64_MAX;
714 
715 	ASSERT_RACCT_ENABLED();
716 
717 	RCTL_RLOCK();
718 
719 	/*
720 	 * There may be more than one matching rule; go through all of them.
721 	 * Denial should be done last, after logging and sending signals.
722 	 */
723 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
724 		rule = link->rrl_rule;
725 		if (rule->rr_resource != resource)
726 			continue;
727 		if (rule->rr_action != RCTL_ACTION_DENY)
728 			continue;
729 		if (rule->rr_amount < amount)
730 			amount = rule->rr_amount;
731 	}
732 
733 	RCTL_RUNLOCK();
734 
735 	return (amount);
736 }
737 
738 uint64_t
739 rctl_get_available(struct proc *p, int resource)
740 {
741 	struct rctl_rule *rule;
742 	struct rctl_rule_link *link;
743 	int64_t available, minavailable, allocated;
744 
745 	minavailable = INT64_MAX;
746 
747 	ASSERT_RACCT_ENABLED();
748 
749 	RCTL_RLOCK();
750 
751 	/*
752 	 * There may be more than one matching rule; go through all of them.
753 	 * Denial should be done last, after logging and sending signals.
754 	 */
755 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
756 		rule = link->rrl_rule;
757 		if (rule->rr_resource != resource)
758 			continue;
759 		if (rule->rr_action != RCTL_ACTION_DENY)
760 			continue;
761 		available = rctl_available_resource(p, rule);
762 		if (available < minavailable)
763 			minavailable = available;
764 	}
765 
766 	RCTL_RUNLOCK();
767 
768 	/*
769 	 * XXX: Think about this _hard_.
770 	 */
771 	allocated = p->p_racct->r_resources[resource];
772 	if (minavailable < INT64_MAX - allocated)
773 		minavailable += allocated;
774 	if (minavailable < 0)
775 		minavailable = 0;
776 	return (minavailable);
777 }
778 
779 static int
780 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
781 {
782 
783 	ASSERT_RACCT_ENABLED();
784 
785 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
786 		if (rule->rr_subject_type != filter->rr_subject_type)
787 			return (0);
788 
789 		switch (filter->rr_subject_type) {
790 		case RCTL_SUBJECT_TYPE_PROCESS:
791 			if (filter->rr_subject.rs_proc != NULL &&
792 			    rule->rr_subject.rs_proc !=
793 			    filter->rr_subject.rs_proc)
794 				return (0);
795 			break;
796 		case RCTL_SUBJECT_TYPE_USER:
797 			if (filter->rr_subject.rs_uip != NULL &&
798 			    rule->rr_subject.rs_uip !=
799 			    filter->rr_subject.rs_uip)
800 				return (0);
801 			break;
802 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
803 			if (filter->rr_subject.rs_loginclass != NULL &&
804 			    rule->rr_subject.rs_loginclass !=
805 			    filter->rr_subject.rs_loginclass)
806 				return (0);
807 			break;
808 		case RCTL_SUBJECT_TYPE_JAIL:
809 			if (filter->rr_subject.rs_prison_racct != NULL &&
810 			    rule->rr_subject.rs_prison_racct !=
811 			    filter->rr_subject.rs_prison_racct)
812 				return (0);
813 			break;
814 		default:
815 			panic("rctl_rule_matches: unknown subject type %d",
816 			    filter->rr_subject_type);
817 		}
818 	}
819 
820 	if (filter->rr_resource != RACCT_UNDEFINED) {
821 		if (rule->rr_resource != filter->rr_resource)
822 			return (0);
823 	}
824 
825 	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
826 		if (rule->rr_action != filter->rr_action)
827 			return (0);
828 	}
829 
830 	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
831 		if (rule->rr_amount != filter->rr_amount)
832 			return (0);
833 	}
834 
835 	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
836 		if (rule->rr_per != filter->rr_per)
837 			return (0);
838 	}
839 
840 	return (1);
841 }
842 
843 static int
844 str2value(const char *str, int *value, struct dict *table)
845 {
846 	int i;
847 
848 	if (value == NULL)
849 		return (EINVAL);
850 
851 	for (i = 0; table[i].d_name != NULL; i++) {
852 		if (strcasecmp(table[i].d_name, str) == 0) {
853 			*value =  table[i].d_value;
854 			return (0);
855 		}
856 	}
857 
858 	return (EINVAL);
859 }
860 
861 static int
862 str2id(const char *str, id_t *value)
863 {
864 	char *end;
865 
866 	if (str == NULL)
867 		return (EINVAL);
868 
869 	*value = strtoul(str, &end, 10);
870 	if ((size_t)(end - str) != strlen(str))
871 		return (EINVAL);
872 
873 	return (0);
874 }
875 
/*
 * Parse 'str' as a non-negative decimal number and store it in '*value'.
 *
 * Returns 0 on success.  Returns EINVAL if 'str' is NULL, contains no
 * digits at all (e.g. the empty string), or has anything left after the
 * number, and ERANGE if the parsed value is negative once stored as
 * int64_t.  '*value' is written whenever 'str' is not NULL, but is only
 * meaningful when 0 is returned.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *end;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &end, 10);

	/*
	 * end == str means strtoul() performed no conversion; without this
	 * check an empty string would be silently accepted as 0.  A
	 * non-NUL *end means trailing garbage.
	 */
	if (end == str || *end != '\0')
		return (EINVAL);

	if (*value < 0)
		return (ERANGE);

	return (0);
}
893 
894 /*
895  * Connect the rule to the racct, increasing refcount for the rule.
896  */
897 static void
898 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
899 {
900 	struct rctl_rule_link *link;
901 
902 	ASSERT_RACCT_ENABLED();
903 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
904 
905 	rctl_rule_acquire(rule);
906 	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
907 	link->rrl_rule = rule;
908 	link->rrl_exceeded = 0;
909 
910 	RCTL_WLOCK();
911 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
912 	RCTL_WUNLOCK();
913 }
914 
915 static int
916 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
917 {
918 	struct rctl_rule_link *link;
919 
920 	ASSERT_RACCT_ENABLED();
921 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
922 	RCTL_WLOCK_ASSERT();
923 
924 	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
925 	if (link == NULL)
926 		return (ENOMEM);
927 	rctl_rule_acquire(rule);
928 	link->rrl_rule = rule;
929 	link->rrl_exceeded = 0;
930 
931 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
932 	return (0);
933 }
934 
935 /*
936  * Remove limits for a rules matching the filter and release
937  * the refcounts for the rules, possibly freeing them.  Returns
938  * the number of limit structures removed.
939  */
940 static int
941 rctl_racct_remove_rules(struct racct *racct,
942     const struct rctl_rule *filter)
943 {
944 	struct rctl_rule_link *link, *linktmp;
945 	int removed = 0;
946 
947 	ASSERT_RACCT_ENABLED();
948 	RCTL_WLOCK_ASSERT();
949 
950 	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
951 		if (!rctl_rule_matches(link->rrl_rule, filter))
952 			continue;
953 
954 		LIST_REMOVE(link, rrl_next);
955 		rctl_rule_release(link->rrl_rule);
956 		uma_zfree(rctl_rule_link_zone, link);
957 		removed++;
958 	}
959 	return (removed);
960 }
961 
962 static void
963 rctl_rule_acquire_subject(struct rctl_rule *rule)
964 {
965 
966 	ASSERT_RACCT_ENABLED();
967 
968 	switch (rule->rr_subject_type) {
969 	case RCTL_SUBJECT_TYPE_UNDEFINED:
970 	case RCTL_SUBJECT_TYPE_PROCESS:
971 		break;
972 	case RCTL_SUBJECT_TYPE_JAIL:
973 		if (rule->rr_subject.rs_prison_racct != NULL)
974 			prison_racct_hold(rule->rr_subject.rs_prison_racct);
975 		break;
976 	case RCTL_SUBJECT_TYPE_USER:
977 		if (rule->rr_subject.rs_uip != NULL)
978 			uihold(rule->rr_subject.rs_uip);
979 		break;
980 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
981 		if (rule->rr_subject.rs_loginclass != NULL)
982 			loginclass_hold(rule->rr_subject.rs_loginclass);
983 		break;
984 	default:
985 		panic("rctl_rule_acquire_subject: unknown subject type %d",
986 		    rule->rr_subject_type);
987 	}
988 }
989 
990 static void
991 rctl_rule_release_subject(struct rctl_rule *rule)
992 {
993 
994 	ASSERT_RACCT_ENABLED();
995 
996 	switch (rule->rr_subject_type) {
997 	case RCTL_SUBJECT_TYPE_UNDEFINED:
998 	case RCTL_SUBJECT_TYPE_PROCESS:
999 		break;
1000 	case RCTL_SUBJECT_TYPE_JAIL:
1001 		if (rule->rr_subject.rs_prison_racct != NULL)
1002 			prison_racct_free(rule->rr_subject.rs_prison_racct);
1003 		break;
1004 	case RCTL_SUBJECT_TYPE_USER:
1005 		if (rule->rr_subject.rs_uip != NULL)
1006 			uifree(rule->rr_subject.rs_uip);
1007 		break;
1008 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1009 		if (rule->rr_subject.rs_loginclass != NULL)
1010 			loginclass_free(rule->rr_subject.rs_loginclass);
1011 		break;
1012 	default:
1013 		panic("rctl_rule_release_subject: unknown subject type %d",
1014 		    rule->rr_subject_type);
1015 	}
1016 }
1017 
1018 struct rctl_rule *
1019 rctl_rule_alloc(int flags)
1020 {
1021 	struct rctl_rule *rule;
1022 
1023 	ASSERT_RACCT_ENABLED();
1024 
1025 	rule = uma_zalloc(rctl_rule_zone, flags);
1026 	if (rule == NULL)
1027 		return (NULL);
1028 	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1029 	rule->rr_subject.rs_proc = NULL;
1030 	rule->rr_subject.rs_uip = NULL;
1031 	rule->rr_subject.rs_loginclass = NULL;
1032 	rule->rr_subject.rs_prison_racct = NULL;
1033 	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1034 	rule->rr_resource = RACCT_UNDEFINED;
1035 	rule->rr_action = RCTL_ACTION_UNDEFINED;
1036 	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1037 	refcount_init(&rule->rr_refcount, 1);
1038 
1039 	return (rule);
1040 }
1041 
1042 struct rctl_rule *
1043 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1044 {
1045 	struct rctl_rule *copy;
1046 
1047 	ASSERT_RACCT_ENABLED();
1048 
1049 	copy = uma_zalloc(rctl_rule_zone, flags);
1050 	if (copy == NULL)
1051 		return (NULL);
1052 	copy->rr_subject_type = rule->rr_subject_type;
1053 	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1054 	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1055 	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1056 	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1057 	copy->rr_per = rule->rr_per;
1058 	copy->rr_resource = rule->rr_resource;
1059 	copy->rr_action = rule->rr_action;
1060 	copy->rr_amount = rule->rr_amount;
1061 	refcount_init(&copy->rr_refcount, 1);
1062 	rctl_rule_acquire_subject(copy);
1063 
1064 	return (copy);
1065 }
1066 
1067 void
1068 rctl_rule_acquire(struct rctl_rule *rule)
1069 {
1070 
1071 	ASSERT_RACCT_ENABLED();
1072 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1073 
1074 	refcount_acquire(&rule->rr_refcount);
1075 }
1076 
1077 static void
1078 rctl_rule_free(void *context, int pending)
1079 {
1080 	struct rctl_rule *rule;
1081 
1082 	rule = (struct rctl_rule *)context;
1083 
1084 	ASSERT_RACCT_ENABLED();
1085 	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1086 
1087 	/*
1088 	 * We don't need locking here; rule is guaranteed to be inaccessible.
1089 	 */
1090 
1091 	rctl_rule_release_subject(rule);
1092 	uma_zfree(rctl_rule_zone, rule);
1093 }
1094 
1095 void
1096 rctl_rule_release(struct rctl_rule *rule)
1097 {
1098 
1099 	ASSERT_RACCT_ENABLED();
1100 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1101 
1102 	if (refcount_release(&rule->rr_refcount)) {
1103 		/*
1104 		 * rctl_rule_release() is often called when iterating
1105 		 * over all the uidinfo structures in the system,
1106 		 * holding uihashtbl_lock.  Since rctl_rule_free()
1107 		 * might end up calling uifree(), this would lead
1108 		 * to lock recursion.  Use taskqueue to avoid this.
1109 		 */
1110 		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1111 		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1112 	}
1113 }
1114 
1115 static int
1116 rctl_rule_fully_specified(const struct rctl_rule *rule)
1117 {
1118 
1119 	ASSERT_RACCT_ENABLED();
1120 
1121 	switch (rule->rr_subject_type) {
1122 	case RCTL_SUBJECT_TYPE_UNDEFINED:
1123 		return (0);
1124 	case RCTL_SUBJECT_TYPE_PROCESS:
1125 		if (rule->rr_subject.rs_proc == NULL)
1126 			return (0);
1127 		break;
1128 	case RCTL_SUBJECT_TYPE_USER:
1129 		if (rule->rr_subject.rs_uip == NULL)
1130 			return (0);
1131 		break;
1132 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1133 		if (rule->rr_subject.rs_loginclass == NULL)
1134 			return (0);
1135 		break;
1136 	case RCTL_SUBJECT_TYPE_JAIL:
1137 		if (rule->rr_subject.rs_prison_racct == NULL)
1138 			return (0);
1139 		break;
1140 	default:
1141 		panic("rctl_rule_fully_specified: unknown subject type %d",
1142 		    rule->rr_subject_type);
1143 	}
1144 	if (rule->rr_resource == RACCT_UNDEFINED)
1145 		return (0);
1146 	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1147 		return (0);
1148 	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1149 		return (0);
1150 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1151 		return (0);
1152 
1153 	return (1);
1154 }
1155 
1156 static int
1157 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1158 {
1159 	struct rctl_rule *rule;
1160 	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1161 	     *amountstr, *perstr;
1162 	id_t id;
1163 	int error = 0;
1164 
1165 	ASSERT_RACCT_ENABLED();
1166 
1167 	rule = rctl_rule_alloc(M_WAITOK);
1168 
1169 	subjectstr = strsep(&rulestr, ":");
1170 	subject_idstr = strsep(&rulestr, ":");
1171 	resourcestr = strsep(&rulestr, ":");
1172 	actionstr = strsep(&rulestr, "=/");
1173 	amountstr = strsep(&rulestr, "/");
1174 	perstr = rulestr;
1175 
1176 	if (subjectstr == NULL || subjectstr[0] == '\0')
1177 		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1178 	else {
1179 		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1180 		if (error != 0)
1181 			goto out;
1182 	}
1183 
1184 	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1185 		rule->rr_subject.rs_proc = NULL;
1186 		rule->rr_subject.rs_uip = NULL;
1187 		rule->rr_subject.rs_loginclass = NULL;
1188 		rule->rr_subject.rs_prison_racct = NULL;
1189 	} else {
1190 		switch (rule->rr_subject_type) {
1191 		case RCTL_SUBJECT_TYPE_UNDEFINED:
1192 			error = EINVAL;
1193 			goto out;
1194 		case RCTL_SUBJECT_TYPE_PROCESS:
1195 			error = str2id(subject_idstr, &id);
1196 			if (error != 0)
1197 				goto out;
1198 			sx_assert(&allproc_lock, SA_LOCKED);
1199 			rule->rr_subject.rs_proc = pfind(id);
1200 			if (rule->rr_subject.rs_proc == NULL) {
1201 				error = ESRCH;
1202 				goto out;
1203 			}
1204 			PROC_UNLOCK(rule->rr_subject.rs_proc);
1205 			break;
1206 		case RCTL_SUBJECT_TYPE_USER:
1207 			error = str2id(subject_idstr, &id);
1208 			if (error != 0)
1209 				goto out;
1210 			rule->rr_subject.rs_uip = uifind(id);
1211 			break;
1212 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1213 			rule->rr_subject.rs_loginclass =
1214 			    loginclass_find(subject_idstr);
1215 			if (rule->rr_subject.rs_loginclass == NULL) {
1216 				error = ENAMETOOLONG;
1217 				goto out;
1218 			}
1219 			break;
1220 		case RCTL_SUBJECT_TYPE_JAIL:
1221 			rule->rr_subject.rs_prison_racct =
1222 			    prison_racct_find(subject_idstr);
1223 			if (rule->rr_subject.rs_prison_racct == NULL) {
1224 				error = ENAMETOOLONG;
1225 				goto out;
1226 			}
1227 			break;
1228                default:
1229                        panic("rctl_string_to_rule: unknown subject type %d",
1230                            rule->rr_subject_type);
1231                }
1232 	}
1233 
1234 	if (resourcestr == NULL || resourcestr[0] == '\0')
1235 		rule->rr_resource = RACCT_UNDEFINED;
1236 	else {
1237 		error = str2value(resourcestr, &rule->rr_resource,
1238 		    resourcenames);
1239 		if (error != 0)
1240 			goto out;
1241 	}
1242 
1243 	if (actionstr == NULL || actionstr[0] == '\0')
1244 		rule->rr_action = RCTL_ACTION_UNDEFINED;
1245 	else {
1246 		error = str2value(actionstr, &rule->rr_action, actionnames);
1247 		if (error != 0)
1248 			goto out;
1249 	}
1250 
1251 	if (amountstr == NULL || amountstr[0] == '\0')
1252 		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1253 	else {
1254 		error = str2int64(amountstr, &rule->rr_amount);
1255 		if (error != 0)
1256 			goto out;
1257 		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1258 			if (rule->rr_amount > INT64_MAX / 1000000) {
1259 				error = ERANGE;
1260 				goto out;
1261 			}
1262 			rule->rr_amount *= 1000000;
1263 		}
1264 	}
1265 
1266 	if (perstr == NULL || perstr[0] == '\0')
1267 		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1268 	else {
1269 		error = str2value(perstr, &rule->rr_per, subjectnames);
1270 		if (error != 0)
1271 			goto out;
1272 	}
1273 
1274 out:
1275 	if (error == 0)
1276 		*rulep = rule;
1277 	else
1278 		rctl_rule_release(rule);
1279 
1280 	return (error);
1281 }
1282 
1283 /*
1284  * Link a rule with all the subjects it applies to.
1285  */
1286 int
1287 rctl_rule_add(struct rctl_rule *rule)
1288 {
1289 	struct proc *p;
1290 	struct ucred *cred;
1291 	struct uidinfo *uip;
1292 	struct prison *pr;
1293 	struct prison_racct *prr;
1294 	struct loginclass *lc;
1295 	struct rctl_rule *rule2;
1296 	int match;
1297 
1298 	ASSERT_RACCT_ENABLED();
1299 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1300 
1301 	/*
1302 	 * Some rules just don't make sense, like "deny" rule for an undeniable
1303 	 * resource.  The exception are the RSS and %CPU resources - they are
1304 	 * not deniable in the racct sense, but the limit is enforced in
1305 	 * a different way.
1306 	 */
1307 	if (rule->rr_action == RCTL_ACTION_DENY &&
1308 	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
1309 	    rule->rr_resource != RACCT_RSS &&
1310 	    rule->rr_resource != RACCT_PCTCPU) {
1311 		return (EOPNOTSUPP);
1312 	}
1313 
1314 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1315 	    !RACCT_IS_DECAYING(rule->rr_resource)) {
1316 		return (EOPNOTSUPP);
1317 	}
1318 
1319 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1320 	    rule->rr_resource == RACCT_PCTCPU) {
1321 		return (EOPNOTSUPP);
1322 	}
1323 
1324 	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1325 	    RACCT_IS_SLOPPY(rule->rr_resource)) {
1326 		return (EOPNOTSUPP);
1327 	}
1328 
1329 	/*
1330 	 * Make sure there are no duplicated rules.  Also, for the "deny"
1331 	 * rules, remove ones differing only by "amount".
1332 	 */
1333 	if (rule->rr_action == RCTL_ACTION_DENY) {
1334 		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1335 		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1336 		rctl_rule_remove(rule2);
1337 		rctl_rule_release(rule2);
1338 	} else
1339 		rctl_rule_remove(rule);
1340 
1341 	switch (rule->rr_subject_type) {
1342 	case RCTL_SUBJECT_TYPE_PROCESS:
1343 		p = rule->rr_subject.rs_proc;
1344 		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1345 
1346 		rctl_racct_add_rule(p->p_racct, rule);
1347 		/*
1348 		 * In case of per-process rule, we don't have anything more
1349 		 * to do.
1350 		 */
1351 		return (0);
1352 
1353 	case RCTL_SUBJECT_TYPE_USER:
1354 		uip = rule->rr_subject.rs_uip;
1355 		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1356 		rctl_racct_add_rule(uip->ui_racct, rule);
1357 		break;
1358 
1359 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1360 		lc = rule->rr_subject.rs_loginclass;
1361 		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1362 		rctl_racct_add_rule(lc->lc_racct, rule);
1363 		break;
1364 
1365 	case RCTL_SUBJECT_TYPE_JAIL:
1366 		prr = rule->rr_subject.rs_prison_racct;
1367 		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1368 		rctl_racct_add_rule(prr->prr_racct, rule);
1369 		break;
1370 
1371 	default:
1372 		panic("rctl_rule_add: unknown subject type %d",
1373 		    rule->rr_subject_type);
1374 	}
1375 
1376 	/*
1377 	 * Now go through all the processes and add the new rule to the ones
1378 	 * it applies to.
1379 	 */
1380 	sx_assert(&allproc_lock, SA_LOCKED);
1381 	FOREACH_PROC_IN_SYSTEM(p) {
1382 		cred = p->p_ucred;
1383 		switch (rule->rr_subject_type) {
1384 		case RCTL_SUBJECT_TYPE_USER:
1385 			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1386 			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1387 				break;
1388 			continue;
1389 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1390 			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1391 				break;
1392 			continue;
1393 		case RCTL_SUBJECT_TYPE_JAIL:
1394 			match = 0;
1395 			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1396 				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1397 					match = 1;
1398 					break;
1399 				}
1400 			}
1401 			if (match)
1402 				break;
1403 			continue;
1404 		default:
1405 			panic("rctl_rule_add: unknown subject type %d",
1406 			    rule->rr_subject_type);
1407 		}
1408 
1409 		rctl_racct_add_rule(p->p_racct, rule);
1410 	}
1411 
1412 	return (0);
1413 }
1414 
/*
 * "Pre" hook handed to the *_racct_foreach() iterators; takes the rctl
 * write lock.
 */
static void
rctl_rule_pre_callback(void)
{

	RCTL_WLOCK();
}
1421 
/*
 * "Post" hook handed to the *_racct_foreach() iterators; drops the rctl
 * write lock.
 */
static void
rctl_rule_post_callback(void)
{

	RCTL_WUNLOCK();
}
1428 
/*
 * Per-racct callback used by rctl_rule_remove().  Removes the rules
 * matching the filter (arg2) from "racct" and adds the number removed
 * to the int counter that arg3 points to.  The rctl write lock must
 * be held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *totalp = arg3;

	ASSERT_RACCT_ENABLED();
	RCTL_WLOCK_ASSERT();

	*totalp += rctl_racct_remove_rules(racct, filter);
}
1442 
1443 /*
1444  * Remove all rules that match the filter.
1445  */
1446 int
1447 rctl_rule_remove(struct rctl_rule *filter)
1448 {
1449 	struct proc *p;
1450 	int found = 0;
1451 
1452 	ASSERT_RACCT_ENABLED();
1453 
1454 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1455 	    filter->rr_subject.rs_proc != NULL) {
1456 		p = filter->rr_subject.rs_proc;
1457 		RCTL_WLOCK();
1458 		found = rctl_racct_remove_rules(p->p_racct, filter);
1459 		RCTL_WUNLOCK();
1460 		if (found)
1461 			return (0);
1462 		return (ESRCH);
1463 	}
1464 
1465 	loginclass_racct_foreach(rctl_rule_remove_callback,
1466 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1467 	    filter, (void *)&found);
1468 	ui_racct_foreach(rctl_rule_remove_callback,
1469 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1470 	    filter, (void *)&found);
1471 	prison_racct_foreach(rctl_rule_remove_callback,
1472 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1473 	    filter, (void *)&found);
1474 
1475 	sx_assert(&allproc_lock, SA_LOCKED);
1476 	RCTL_WLOCK();
1477 	FOREACH_PROC_IN_SYSTEM(p) {
1478 		found += rctl_racct_remove_rules(p->p_racct, filter);
1479 	}
1480 	RCTL_WUNLOCK();
1481 
1482 	if (found)
1483 		return (0);
1484 	return (ESRCH);
1485 }
1486 
1487 /*
1488  * Appends a rule to the sbuf.
1489  */
1490 static void
1491 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1492 {
1493 	int64_t amount;
1494 
1495 	ASSERT_RACCT_ENABLED();
1496 
1497 	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1498 
1499 	switch (rule->rr_subject_type) {
1500 	case RCTL_SUBJECT_TYPE_PROCESS:
1501 		if (rule->rr_subject.rs_proc == NULL)
1502 			sbuf_printf(sb, ":");
1503 		else
1504 			sbuf_printf(sb, "%d:",
1505 			    rule->rr_subject.rs_proc->p_pid);
1506 		break;
1507 	case RCTL_SUBJECT_TYPE_USER:
1508 		if (rule->rr_subject.rs_uip == NULL)
1509 			sbuf_printf(sb, ":");
1510 		else
1511 			sbuf_printf(sb, "%d:",
1512 			    rule->rr_subject.rs_uip->ui_uid);
1513 		break;
1514 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1515 		if (rule->rr_subject.rs_loginclass == NULL)
1516 			sbuf_printf(sb, ":");
1517 		else
1518 			sbuf_printf(sb, "%s:",
1519 			    rule->rr_subject.rs_loginclass->lc_name);
1520 		break;
1521 	case RCTL_SUBJECT_TYPE_JAIL:
1522 		if (rule->rr_subject.rs_prison_racct == NULL)
1523 			sbuf_printf(sb, ":");
1524 		else
1525 			sbuf_printf(sb, "%s:",
1526 			    rule->rr_subject.rs_prison_racct->prr_name);
1527 		break;
1528 	default:
1529 		panic("rctl_rule_to_sbuf: unknown subject type %d",
1530 		    rule->rr_subject_type);
1531 	}
1532 
1533 	amount = rule->rr_amount;
1534 	if (amount != RCTL_AMOUNT_UNDEFINED &&
1535 	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1536 		amount /= 1000000;
1537 
1538 	sbuf_printf(sb, "%s:%s=%jd",
1539 	    rctl_resource_name(rule->rr_resource),
1540 	    rctl_action_name(rule->rr_action),
1541 	    amount);
1542 
1543 	if (rule->rr_per != rule->rr_subject_type)
1544 		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1545 }
1546 
1547 /*
1548  * Routine used by RCTL syscalls to read in input string.
1549  */
1550 static int
1551 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1552 {
1553 	char *str;
1554 	int error;
1555 
1556 	ASSERT_RACCT_ENABLED();
1557 
1558 	if (inbuflen <= 0)
1559 		return (EINVAL);
1560 	if (inbuflen > RCTL_MAX_INBUFSIZE)
1561 		return (E2BIG);
1562 
1563 	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1564 	error = copyinstr(inbufp, str, inbuflen, NULL);
1565 	if (error != 0) {
1566 		free(str, M_RCTL);
1567 		return (error);
1568 	}
1569 
1570 	*inputstr = str;
1571 
1572 	return (0);
1573 }
1574 
1575 /*
1576  * Routine used by RCTL syscalls to write out output string.
1577  */
1578 static int
1579 rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1580 {
1581 	int error;
1582 
1583 	ASSERT_RACCT_ENABLED();
1584 
1585 	if (outputsbuf == NULL)
1586 		return (0);
1587 
1588 	sbuf_finish(outputsbuf);
1589 	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1590 		sbuf_delete(outputsbuf);
1591 		return (ERANGE);
1592 	}
1593 	error = copyout(sbuf_data(outputsbuf), outbufp,
1594 	    sbuf_len(outputsbuf) + 1);
1595 	sbuf_delete(outputsbuf);
1596 	return (error);
1597 }
1598 
1599 static struct sbuf *
1600 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1601 {
1602 	struct sbuf *sb;
1603 	int64_t amount;
1604 	int i;
1605 
1606 	ASSERT_RACCT_ENABLED();
1607 
1608 	sb = sbuf_new_auto();
1609 	for (i = 0; i <= RACCT_MAX; i++) {
1610 		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1611 			continue;
1612 		amount = racct->r_resources[i];
1613 		if (RACCT_IS_IN_MILLIONS(i))
1614 			amount /= 1000000;
1615 		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1616 	}
1617 	sbuf_setpos(sb, sbuf_len(sb) - 1);
1618 	return (sb);
1619 }
1620 
1621 int
1622 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1623 {
1624 	struct rctl_rule *filter;
1625 	struct sbuf *outputsbuf = NULL;
1626 	struct proc *p;
1627 	struct uidinfo *uip;
1628 	struct loginclass *lc;
1629 	struct prison_racct *prr;
1630 	char *inputstr;
1631 	int error;
1632 
1633 	if (!racct_enable)
1634 		return (ENOSYS);
1635 
1636 	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1637 	if (error != 0)
1638 		return (error);
1639 
1640 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1641 	if (error != 0)
1642 		return (error);
1643 
1644 	sx_slock(&allproc_lock);
1645 	error = rctl_string_to_rule(inputstr, &filter);
1646 	free(inputstr, M_RCTL);
1647 	if (error != 0) {
1648 		sx_sunlock(&allproc_lock);
1649 		return (error);
1650 	}
1651 
1652 	switch (filter->rr_subject_type) {
1653 	case RCTL_SUBJECT_TYPE_PROCESS:
1654 		p = filter->rr_subject.rs_proc;
1655 		if (p == NULL) {
1656 			error = EINVAL;
1657 			goto out;
1658 		}
1659 		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1660 		break;
1661 	case RCTL_SUBJECT_TYPE_USER:
1662 		uip = filter->rr_subject.rs_uip;
1663 		if (uip == NULL) {
1664 			error = EINVAL;
1665 			goto out;
1666 		}
1667 		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1668 		break;
1669 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1670 		lc = filter->rr_subject.rs_loginclass;
1671 		if (lc == NULL) {
1672 			error = EINVAL;
1673 			goto out;
1674 		}
1675 		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1676 		break;
1677 	case RCTL_SUBJECT_TYPE_JAIL:
1678 		prr = filter->rr_subject.rs_prison_racct;
1679 		if (prr == NULL) {
1680 			error = EINVAL;
1681 			goto out;
1682 		}
1683 		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1684 		break;
1685 	default:
1686 		error = EINVAL;
1687 	}
1688 out:
1689 	rctl_rule_release(filter);
1690 	sx_sunlock(&allproc_lock);
1691 	if (error != 0)
1692 		return (error);
1693 
1694 	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1695 
1696 	return (error);
1697 }
1698 
1699 static void
1700 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1701 {
1702 	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1703 	struct rctl_rule_link *link;
1704 	struct sbuf *sb = (struct sbuf *)arg3;
1705 
1706 	ASSERT_RACCT_ENABLED();
1707 	RCTL_LOCK_ASSERT();
1708 
1709 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1710 		if (!rctl_rule_matches(link->rrl_rule, filter))
1711 			continue;
1712 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1713 		sbuf_printf(sb, ",");
1714 	}
1715 }
1716 
1717 int
1718 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1719 {
1720 	struct sbuf *sb;
1721 	struct rctl_rule *filter;
1722 	struct rctl_rule_link *link;
1723 	struct proc *p;
1724 	char *inputstr, *buf;
1725 	size_t bufsize;
1726 	int error;
1727 
1728 	if (!racct_enable)
1729 		return (ENOSYS);
1730 
1731 	error = priv_check(td, PRIV_RCTL_GET_RULES);
1732 	if (error != 0)
1733 		return (error);
1734 
1735 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1736 	if (error != 0)
1737 		return (error);
1738 
1739 	sx_slock(&allproc_lock);
1740 	error = rctl_string_to_rule(inputstr, &filter);
1741 	free(inputstr, M_RCTL);
1742 	if (error != 0) {
1743 		sx_sunlock(&allproc_lock);
1744 		return (error);
1745 	}
1746 
1747 	bufsize = uap->outbuflen;
1748 	if (bufsize > rctl_maxbufsize) {
1749 		sx_sunlock(&allproc_lock);
1750 		return (E2BIG);
1751 	}
1752 
1753 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1754 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1755 	KASSERT(sb != NULL, ("sbuf_new failed"));
1756 
1757 	FOREACH_PROC_IN_SYSTEM(p) {
1758 		RCTL_RLOCK();
1759 		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1760 			/*
1761 			 * Non-process rules will be added to the buffer later.
1762 			 * Adding them here would result in duplicated output.
1763 			 */
1764 			if (link->rrl_rule->rr_subject_type !=
1765 			    RCTL_SUBJECT_TYPE_PROCESS)
1766 				continue;
1767 			if (!rctl_rule_matches(link->rrl_rule, filter))
1768 				continue;
1769 			rctl_rule_to_sbuf(sb, link->rrl_rule);
1770 			sbuf_printf(sb, ",");
1771 		}
1772 		RCTL_RUNLOCK();
1773 	}
1774 
1775 	loginclass_racct_foreach(rctl_get_rules_callback,
1776 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1777 	    filter, sb);
1778 	ui_racct_foreach(rctl_get_rules_callback,
1779 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1780 	    filter, sb);
1781 	prison_racct_foreach(rctl_get_rules_callback,
1782 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1783 	    filter, sb);
1784 	if (sbuf_error(sb) == ENOMEM) {
1785 		error = ERANGE;
1786 		goto out;
1787 	}
1788 
1789 	/*
1790 	 * Remove trailing ",".
1791 	 */
1792 	if (sbuf_len(sb) > 0)
1793 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1794 
1795 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1796 out:
1797 	rctl_rule_release(filter);
1798 	sx_sunlock(&allproc_lock);
1799 	free(buf, M_RCTL);
1800 	return (error);
1801 }
1802 
1803 int
1804 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1805 {
1806 	struct sbuf *sb;
1807 	struct rctl_rule *filter;
1808 	struct rctl_rule_link *link;
1809 	char *inputstr, *buf;
1810 	size_t bufsize;
1811 	int error;
1812 
1813 	if (!racct_enable)
1814 		return (ENOSYS);
1815 
1816 	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1817 	if (error != 0)
1818 		return (error);
1819 
1820 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1821 	if (error != 0)
1822 		return (error);
1823 
1824 	sx_slock(&allproc_lock);
1825 	error = rctl_string_to_rule(inputstr, &filter);
1826 	free(inputstr, M_RCTL);
1827 	if (error != 0) {
1828 		sx_sunlock(&allproc_lock);
1829 		return (error);
1830 	}
1831 
1832 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1833 		rctl_rule_release(filter);
1834 		sx_sunlock(&allproc_lock);
1835 		return (EINVAL);
1836 	}
1837 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1838 		rctl_rule_release(filter);
1839 		sx_sunlock(&allproc_lock);
1840 		return (EOPNOTSUPP);
1841 	}
1842 	if (filter->rr_subject.rs_proc == NULL) {
1843 		rctl_rule_release(filter);
1844 		sx_sunlock(&allproc_lock);
1845 		return (EINVAL);
1846 	}
1847 
1848 	bufsize = uap->outbuflen;
1849 	if (bufsize > rctl_maxbufsize) {
1850 		rctl_rule_release(filter);
1851 		sx_sunlock(&allproc_lock);
1852 		return (E2BIG);
1853 	}
1854 
1855 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1856 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1857 	KASSERT(sb != NULL, ("sbuf_new failed"));
1858 
1859 	RCTL_RLOCK();
1860 	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1861 	    rrl_next) {
1862 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1863 		sbuf_printf(sb, ",");
1864 	}
1865 	RCTL_RUNLOCK();
1866 	if (sbuf_error(sb) == ENOMEM) {
1867 		error = ERANGE;
1868 		goto out;
1869 	}
1870 
1871 	/*
1872 	 * Remove trailing ",".
1873 	 */
1874 	if (sbuf_len(sb) > 0)
1875 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1876 
1877 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1878 out:
1879 	rctl_rule_release(filter);
1880 	sx_sunlock(&allproc_lock);
1881 	free(buf, M_RCTL);
1882 	return (error);
1883 }
1884 
1885 int
1886 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1887 {
1888 	struct rctl_rule *rule;
1889 	char *inputstr;
1890 	int error;
1891 
1892 	if (!racct_enable)
1893 		return (ENOSYS);
1894 
1895 	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1896 	if (error != 0)
1897 		return (error);
1898 
1899 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1900 	if (error != 0)
1901 		return (error);
1902 
1903 	sx_slock(&allproc_lock);
1904 	error = rctl_string_to_rule(inputstr, &rule);
1905 	free(inputstr, M_RCTL);
1906 	if (error != 0) {
1907 		sx_sunlock(&allproc_lock);
1908 		return (error);
1909 	}
1910 	/*
1911 	 * The 'per' part of a rule is optional.
1912 	 */
1913 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1914 	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1915 		rule->rr_per = rule->rr_subject_type;
1916 
1917 	if (!rctl_rule_fully_specified(rule)) {
1918 		error = EINVAL;
1919 		goto out;
1920 	}
1921 
1922 	error = rctl_rule_add(rule);
1923 
1924 out:
1925 	rctl_rule_release(rule);
1926 	sx_sunlock(&allproc_lock);
1927 	return (error);
1928 }
1929 
1930 int
1931 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1932 {
1933 	struct rctl_rule *filter;
1934 	char *inputstr;
1935 	int error;
1936 
1937 	if (!racct_enable)
1938 		return (ENOSYS);
1939 
1940 	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1941 	if (error != 0)
1942 		return (error);
1943 
1944 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1945 	if (error != 0)
1946 		return (error);
1947 
1948 	sx_slock(&allproc_lock);
1949 	error = rctl_string_to_rule(inputstr, &filter);
1950 	free(inputstr, M_RCTL);
1951 	if (error != 0) {
1952 		sx_sunlock(&allproc_lock);
1953 		return (error);
1954 	}
1955 
1956 	error = rctl_rule_remove(filter);
1957 	rctl_rule_release(filter);
1958 	sx_sunlock(&allproc_lock);
1959 
1960 	return (error);
1961 }
1962 
1963 /*
1964  * Update RCTL rule list after credential change.
1965  */
1966 void
1967 rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1968 {
1969 	LIST_HEAD(, rctl_rule_link) newrules;
1970 	struct rctl_rule_link *link, *newlink;
1971 	struct uidinfo *newuip;
1972 	struct loginclass *newlc;
1973 	struct prison_racct *newprr;
1974 	int rulecnt, i;
1975 
1976 	ASSERT_RACCT_ENABLED();
1977 
1978 	newuip = newcred->cr_ruidinfo;
1979 	newlc = newcred->cr_loginclass;
1980 	newprr = newcred->cr_prison->pr_prison_racct;
1981 
1982 	LIST_INIT(&newrules);
1983 
1984 again:
1985 	/*
1986 	 * First, count the rules that apply to the process with new
1987 	 * credentials.
1988 	 */
1989 	rulecnt = 0;
1990 	RCTL_RLOCK();
1991 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1992 		if (link->rrl_rule->rr_subject_type ==
1993 		    RCTL_SUBJECT_TYPE_PROCESS)
1994 			rulecnt++;
1995 	}
1996 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1997 		rulecnt++;
1998 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
1999 		rulecnt++;
2000 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
2001 		rulecnt++;
2002 	RCTL_RUNLOCK();
2003 
2004 	/*
2005 	 * Create temporary list.  We've dropped the rctl_lock in order
2006 	 * to use M_WAITOK.
2007 	 */
2008 	for (i = 0; i < rulecnt; i++) {
2009 		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
2010 		newlink->rrl_rule = NULL;
2011 		newlink->rrl_exceeded = 0;
2012 		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
2013 	}
2014 
2015 	newlink = LIST_FIRST(&newrules);
2016 
2017 	/*
2018 	 * Assign rules to the newly allocated list entries.
2019 	 */
2020 	RCTL_WLOCK();
2021 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2022 		if (link->rrl_rule->rr_subject_type ==
2023 		    RCTL_SUBJECT_TYPE_PROCESS) {
2024 			if (newlink == NULL)
2025 				goto goaround;
2026 			rctl_rule_acquire(link->rrl_rule);
2027 			newlink->rrl_rule = link->rrl_rule;
2028 			newlink->rrl_exceeded = link->rrl_exceeded;
2029 			newlink = LIST_NEXT(newlink, rrl_next);
2030 			rulecnt--;
2031 		}
2032 	}
2033 
2034 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2035 		if (newlink == NULL)
2036 			goto goaround;
2037 		rctl_rule_acquire(link->rrl_rule);
2038 		newlink->rrl_rule = link->rrl_rule;
2039 		newlink->rrl_exceeded = link->rrl_exceeded;
2040 		newlink = LIST_NEXT(newlink, rrl_next);
2041 		rulecnt--;
2042 	}
2043 
2044 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2045 		if (newlink == NULL)
2046 			goto goaround;
2047 		rctl_rule_acquire(link->rrl_rule);
2048 		newlink->rrl_rule = link->rrl_rule;
2049 		newlink->rrl_exceeded = link->rrl_exceeded;
2050 		newlink = LIST_NEXT(newlink, rrl_next);
2051 		rulecnt--;
2052 	}
2053 
2054 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2055 		if (newlink == NULL)
2056 			goto goaround;
2057 		rctl_rule_acquire(link->rrl_rule);
2058 		newlink->rrl_rule = link->rrl_rule;
2059 		newlink->rrl_exceeded = link->rrl_exceeded;
2060 		newlink = LIST_NEXT(newlink, rrl_next);
2061 		rulecnt--;
2062 	}
2063 
2064 	if (rulecnt == 0) {
2065 		/*
2066 		 * Free the old rule list.
2067 		 */
2068 		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2069 			link = LIST_FIRST(&p->p_racct->r_rule_links);
2070 			LIST_REMOVE(link, rrl_next);
2071 			rctl_rule_release(link->rrl_rule);
2072 			uma_zfree(rctl_rule_link_zone, link);
2073 		}
2074 
2075 		/*
2076 		 * Replace lists and we're done.
2077 		 *
2078 		 * XXX: Is there any way to switch list heads instead
2079 		 *      of iterating here?
2080 		 */
2081 		while (!LIST_EMPTY(&newrules)) {
2082 			newlink = LIST_FIRST(&newrules);
2083 			LIST_REMOVE(newlink, rrl_next);
2084 			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2085 			    newlink, rrl_next);
2086 		}
2087 
2088 		RCTL_WUNLOCK();
2089 
2090 		return;
2091 	}
2092 
2093 goaround:
2094 	RCTL_WUNLOCK();
2095 
2096 	/*
2097 	 * Rule list changed while we were not holding the rctl_lock.
2098 	 * Free the new list and try again.
2099 	 */
2100 	while (!LIST_EMPTY(&newrules)) {
2101 		newlink = LIST_FIRST(&newrules);
2102 		LIST_REMOVE(newlink, rrl_next);
2103 		if (newlink->rrl_rule != NULL)
2104 			rctl_rule_release(newlink->rrl_rule);
2105 		uma_zfree(rctl_rule_link_zone, newlink);
2106 	}
2107 
2108 	goto again;
2109 }
2110 
2111 /*
2112  * Assign RCTL rules to the newly created process.
2113  */
2114 int
2115 rctl_proc_fork(struct proc *parent, struct proc *child)
2116 {
2117 	struct rctl_rule *rule;
2118 	struct rctl_rule_link *link;
2119 	int error;
2120 
2121 	LIST_INIT(&child->p_racct->r_rule_links);
2122 
2123 	ASSERT_RACCT_ENABLED();
2124 	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2125 
2126 	RCTL_WLOCK();
2127 
2128 	/*
2129 	 * Go through limits applicable to the parent and assign them
2130 	 * to the child.  Rules with 'process' subject have to be duplicated
2131 	 * in order to make their rr_subject point to the new process.
2132 	 */
2133 	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2134 		if (link->rrl_rule->rr_subject_type ==
2135 		    RCTL_SUBJECT_TYPE_PROCESS) {
2136 			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2137 			if (rule == NULL)
2138 				goto fail;
2139 			KASSERT(rule->rr_subject.rs_proc == parent,
2140 			    ("rule->rr_subject.rs_proc != parent"));
2141 			rule->rr_subject.rs_proc = child;
2142 			error = rctl_racct_add_rule_locked(child->p_racct,
2143 			    rule);
2144 			rctl_rule_release(rule);
2145 			if (error != 0)
2146 				goto fail;
2147 		} else {
2148 			error = rctl_racct_add_rule_locked(child->p_racct,
2149 			    link->rrl_rule);
2150 			if (error != 0)
2151 				goto fail;
2152 		}
2153 	}
2154 
2155 	RCTL_WUNLOCK();
2156 	return (0);
2157 
2158 fail:
2159 	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2160 		link = LIST_FIRST(&child->p_racct->r_rule_links);
2161 		LIST_REMOVE(link, rrl_next);
2162 		rctl_rule_release(link->rrl_rule);
2163 		uma_zfree(rctl_rule_link_zone, link);
2164 	}
2165 	RCTL_WUNLOCK();
2166 	return (EAGAIN);
2167 }
2168 
2169 /*
2170  * Release rules attached to the racct.
2171  */
2172 void
2173 rctl_racct_release(struct racct *racct)
2174 {
2175 	struct rctl_rule_link *link;
2176 
2177 	ASSERT_RACCT_ENABLED();
2178 
2179 	RCTL_WLOCK();
2180 	while (!LIST_EMPTY(&racct->r_rule_links)) {
2181 		link = LIST_FIRST(&racct->r_rule_links);
2182 		LIST_REMOVE(link, rrl_next);
2183 		rctl_rule_release(link->rrl_rule);
2184 		uma_zfree(rctl_rule_link_zone, link);
2185 	}
2186 	RCTL_WUNLOCK();
2187 }
2188 
2189 static void
2190 rctl_init(void)
2191 {
2192 
2193 	if (!racct_enable)
2194 		return;
2195 
2196 	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2197 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2198 	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2199 	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2200 	    UMA_ALIGN_PTR, 0);
2201 
2202 	/*
2203 	 * Set default values, making sure not to overwrite the ones
2204 	 * fetched from tunables.  Most of those could be set at the
2205 	 * declaration, except for the rctl_throttle_max - we cannot
2206 	 * set it there due to hz not being compile time constant.
2207 	 */
2208 	if (rctl_throttle_min < 1)
2209 		rctl_throttle_min = 1;
2210 	if (rctl_throttle_max < rctl_throttle_min)
2211 		rctl_throttle_max = 2 * hz;
2212 	if (rctl_throttle_pct < 0)
2213 		rctl_throttle_pct = 100;
2214 	if (rctl_throttle_pct2 < 0)
2215 		rctl_throttle_pct2 = 100;
2216 }
2217 
2218 #else /* !RCTL */
2219 
/*
 * Stub used when the kernel is built without RCTL: always fails with
 * ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

	return (ENOSYS);
}
2226 
/*
 * Stub used when the kernel is built without RCTL: always fails with
 * ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

	return (ENOSYS);
}
2233 
/*
 * Stub used when the kernel is built without RCTL: always fails with
 * ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

	return (ENOSYS);
}
2240 
/*
 * Stub used when the kernel is built without RCTL: always fails with
 * ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

	return (ENOSYS);
}
2247 
/*
 * Stub used when the kernel is built without RCTL: always fails with
 * ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

	return (ENOSYS);
}
2254 
2255 #endif /* !RCTL */
2256