xref: /freebsd/sys/kern/kern_rctl.c (revision b3e7694832e81d7a904a10f525f8797b753bf0d3)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2010 The FreeBSD Foundation
5  *
6  * This software was developed by Edward Tomasz Napierala under sponsorship
7  * from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/devctl.h>
36 #include <sys/malloc.h>
37 #include <sys/queue.h>
38 #include <sys/refcount.h>
39 #include <sys/jail.h>
40 #include <sys/kernel.h>
41 #include <sys/limits.h>
42 #include <sys/loginclass.h>
43 #include <sys/priv.h>
44 #include <sys/proc.h>
45 #include <sys/racct.h>
46 #include <sys/rctl.h>
47 #include <sys/resourcevar.h>
48 #include <sys/sx.h>
49 #include <sys/sysproto.h>
50 #include <sys/systm.h>
51 #include <sys/types.h>
52 #include <sys/eventhandler.h>
53 #include <sys/lock.h>
54 #include <sys/mutex.h>
55 #include <sys/rwlock.h>
56 #include <sys/sbuf.h>
57 #include <sys/taskqueue.h>
58 #include <sys/tree.h>
59 #include <vm/uma.h>
60 
61 #ifdef RCTL
62 #ifndef RACCT
63 #error "The RCTL option requires the RACCT option"
64 #endif
65 
66 FEATURE(rctl, "Resource Limits");
67 
/*
 * NOTE(review): no user of the HRF_* flags is visible in this part of the
 * file; their meaning should be confirmed against the code that uses them.
 */
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer size limits.  The replacement lists are parenthesized so that
 * the macros expand safely inside larger expressions (for example
 * "len / RCTL_MAX_INBUFSIZE" or "RCTL_MAX_OUTBUFSIZE % n").
 * RCTL_MAX_OUTBUFSIZE is the initial value of rctl_maxbufsize and
 * RCTL_LOG_BUFSIZE is the size of the message buffers allocated by
 * rctl_enforce().
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/*
 * Margin subtracted by rctl_pcpu_available() from the available %CPU.
 */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
77 
78 static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
79 static int rctl_log_rate_limit = 10;
80 static int rctl_devctl_rate_limit = 10;
81 
82 /*
83  * Values below are initialized in rctl_init().
84  */
85 static int rctl_throttle_min = -1;
86 static int rctl_throttle_max = -1;
87 static int rctl_throttle_pct = -1;
88 static int rctl_throttle_pct2 = -1;
89 
90 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
91 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
92 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
93 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);
94 
95 SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
96     "Resource Limits");
97 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
98     &rctl_maxbufsize, 0, "Maximum output buffer size");
99 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
100     &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
101 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
102     &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
103 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
104     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
105     &rctl_throttle_min_sysctl, "IU",
106     "Shortest throttling duration, in hz");
107 TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
108 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
109     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
110     &rctl_throttle_max_sysctl, "IU",
111     "Longest throttling duration, in hz");
112 TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
113 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
114     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
115     &rctl_throttle_pct_sysctl, "IU",
116     "Throttling penalty for process consumption, in percent");
117 TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
118 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
119     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
120     &rctl_throttle_pct2_sysctl, "IU",
121     "Throttling penalty for container consumption, in percent");
122 TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
123 
124 /*
125  * 'rctl_rule_link' connects a rule with every racct it's related to.
126  * For example, rule 'user:X:openfiles:deny=N/process' is linked
127  * with uidinfo for user X, and to each process of that user.
128  */
129 struct rctl_rule_link {
130 	LIST_ENTRY(rctl_rule_link)	rrl_next;
131 	struct rctl_rule		*rrl_rule;
132 	int				rrl_exceeded;
133 };
134 
/*
 * One name/value pair of a lookup table.  Tables are arrays of these,
 * terminated by an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* textual name */
	int		d_value;	/* matching numeric constant */
};
139 
140 static struct dict subjectnames[] = {
141 	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
142 	{ "user", RCTL_SUBJECT_TYPE_USER },
143 	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
144 	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
145 	{ NULL, -1 }};
146 
147 static struct dict resourcenames[] = {
148 	{ "cputime", RACCT_CPU },
149 	{ "datasize", RACCT_DATA },
150 	{ "stacksize", RACCT_STACK },
151 	{ "coredumpsize", RACCT_CORE },
152 	{ "memoryuse", RACCT_RSS },
153 	{ "memorylocked", RACCT_MEMLOCK },
154 	{ "maxproc", RACCT_NPROC },
155 	{ "openfiles", RACCT_NOFILE },
156 	{ "vmemoryuse", RACCT_VMEM },
157 	{ "pseudoterminals", RACCT_NPTS },
158 	{ "swapuse", RACCT_SWAP },
159 	{ "nthr", RACCT_NTHR },
160 	{ "msgqqueued", RACCT_MSGQQUEUED },
161 	{ "msgqsize", RACCT_MSGQSIZE },
162 	{ "nmsgq", RACCT_NMSGQ },
163 	{ "nsem", RACCT_NSEM },
164 	{ "nsemop", RACCT_NSEMOP },
165 	{ "nshm", RACCT_NSHM },
166 	{ "shmsize", RACCT_SHMSIZE },
167 	{ "wallclock", RACCT_WALLCLOCK },
168 	{ "pcpu", RACCT_PCTCPU },
169 	{ "readbps", RACCT_READBPS },
170 	{ "writebps", RACCT_WRITEBPS },
171 	{ "readiops", RACCT_READIOPS },
172 	{ "writeiops", RACCT_WRITEIOPS },
173 	{ NULL, -1 }};
174 
175 static struct dict actionnames[] = {
176 	{ "sighup", RCTL_ACTION_SIGHUP },
177 	{ "sigint", RCTL_ACTION_SIGINT },
178 	{ "sigquit", RCTL_ACTION_SIGQUIT },
179 	{ "sigill", RCTL_ACTION_SIGILL },
180 	{ "sigtrap", RCTL_ACTION_SIGTRAP },
181 	{ "sigabrt", RCTL_ACTION_SIGABRT },
182 	{ "sigemt", RCTL_ACTION_SIGEMT },
183 	{ "sigfpe", RCTL_ACTION_SIGFPE },
184 	{ "sigkill", RCTL_ACTION_SIGKILL },
185 	{ "sigbus", RCTL_ACTION_SIGBUS },
186 	{ "sigsegv", RCTL_ACTION_SIGSEGV },
187 	{ "sigsys", RCTL_ACTION_SIGSYS },
188 	{ "sigpipe", RCTL_ACTION_SIGPIPE },
189 	{ "sigalrm", RCTL_ACTION_SIGALRM },
190 	{ "sigterm", RCTL_ACTION_SIGTERM },
191 	{ "sigurg", RCTL_ACTION_SIGURG },
192 	{ "sigstop", RCTL_ACTION_SIGSTOP },
193 	{ "sigtstp", RCTL_ACTION_SIGTSTP },
194 	{ "sigchld", RCTL_ACTION_SIGCHLD },
195 	{ "sigttin", RCTL_ACTION_SIGTTIN },
196 	{ "sigttou", RCTL_ACTION_SIGTTOU },
197 	{ "sigio", RCTL_ACTION_SIGIO },
198 	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
199 	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
200 	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
201 	{ "sigprof", RCTL_ACTION_SIGPROF },
202 	{ "sigwinch", RCTL_ACTION_SIGWINCH },
203 	{ "siginfo", RCTL_ACTION_SIGINFO },
204 	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
205 	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
206 	{ "sigthr", RCTL_ACTION_SIGTHR },
207 	{ "deny", RCTL_ACTION_DENY },
208 	{ "log", RCTL_ACTION_LOG },
209 	{ "devctl", RCTL_ACTION_DEVCTL },
210 	{ "throttle", RCTL_ACTION_THROTTLE },
211 	{ NULL, -1 }};
212 
213 static void rctl_init(void);
214 SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
215 
216 static uma_zone_t rctl_rule_zone;
217 static uma_zone_t rctl_rule_link_zone;
218 
219 static int rctl_rule_fully_specified(const struct rctl_rule *rule);
220 static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
221 
222 static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
223 
224 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
225 {
226 	int error, val = rctl_throttle_min;
227 
228 	error = sysctl_handle_int(oidp, &val, 0, req);
229 	if (error || !req->newptr)
230 		return (error);
231 	if (val < 1 || val > rctl_throttle_max)
232 		return (EINVAL);
233 
234 	RACCT_LOCK();
235 	rctl_throttle_min = val;
236 	RACCT_UNLOCK();
237 
238 	return (0);
239 }
240 
241 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
242 {
243 	int error, val = rctl_throttle_max;
244 
245 	error = sysctl_handle_int(oidp, &val, 0, req);
246 	if (error || !req->newptr)
247 		return (error);
248 	if (val < rctl_throttle_min)
249 		return (EINVAL);
250 
251 	RACCT_LOCK();
252 	rctl_throttle_max = val;
253 	RACCT_UNLOCK();
254 
255 	return (0);
256 }
257 
258 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
259 {
260 	int error, val = rctl_throttle_pct;
261 
262 	error = sysctl_handle_int(oidp, &val, 0, req);
263 	if (error || !req->newptr)
264 		return (error);
265 	if (val < 0)
266 		return (EINVAL);
267 
268 	RACCT_LOCK();
269 	rctl_throttle_pct = val;
270 	RACCT_UNLOCK();
271 
272 	return (0);
273 }
274 
275 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
276 {
277 	int error, val = rctl_throttle_pct2;
278 
279 	error = sysctl_handle_int(oidp, &val, 0, req);
280 	if (error || !req->newptr)
281 		return (error);
282 	if (val < 0)
283 		return (EINVAL);
284 
285 	RACCT_LOCK();
286 	rctl_throttle_pct2 = val;
287 	RACCT_UNLOCK();
288 
289 	return (0);
290 }
291 
292 static const char *
293 rctl_subject_type_name(int subject)
294 {
295 	int i;
296 
297 	for (i = 0; subjectnames[i].d_name != NULL; i++) {
298 		if (subjectnames[i].d_value == subject)
299 			return (subjectnames[i].d_name);
300 	}
301 
302 	panic("rctl_subject_type_name: unknown subject type %d", subject);
303 }
304 
305 static const char *
306 rctl_action_name(int action)
307 {
308 	int i;
309 
310 	for (i = 0; actionnames[i].d_name != NULL; i++) {
311 		if (actionnames[i].d_value == action)
312 			return (actionnames[i].d_name);
313 	}
314 
315 	panic("rctl_action_name: unknown action %d", action);
316 }
317 
318 const char *
319 rctl_resource_name(int resource)
320 {
321 	int i;
322 
323 	for (i = 0; resourcenames[i].d_name != NULL; i++) {
324 		if (resourcenames[i].d_value == resource)
325 			return (resourcenames[i].d_name);
326 	}
327 
328 	panic("rctl_resource_name: unknown resource %d", resource);
329 }
330 
331 static struct racct *
332 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
333 {
334 	struct ucred *cred = p->p_ucred;
335 
336 	ASSERT_RACCT_ENABLED();
337 	RACCT_LOCK_ASSERT();
338 
339 	switch (rule->rr_per) {
340 	case RCTL_SUBJECT_TYPE_PROCESS:
341 		return (p->p_racct);
342 	case RCTL_SUBJECT_TYPE_USER:
343 		return (cred->cr_ruidinfo->ui_racct);
344 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
345 		return (cred->cr_loginclass->lc_racct);
346 	case RCTL_SUBJECT_TYPE_JAIL:
347 		return (cred->cr_prison->pr_prison_racct->prr_racct);
348 	default:
349 		panic("%s: unknown per %d", __func__, rule->rr_per);
350 	}
351 }
352 
353 /*
354  * Return the amount of resource that can be allocated by 'p' before
355  * hitting 'rule'.
356  */
357 static int64_t
358 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
359 {
360 	const struct racct *racct;
361 	int64_t available;
362 
363 	ASSERT_RACCT_ENABLED();
364 	RACCT_LOCK_ASSERT();
365 
366 	racct = rctl_proc_rule_to_racct(p, rule);
367 	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
368 
369 	return (available);
370 }
371 
372 /*
373  * Called every second for proc, uidinfo, loginclass, and jail containers.
374  * If the limit isn't exceeded, it decreases the usage amount to zero.
375  * Otherwise, it decreases it by the value of the limit.  This way
376  * resource consumption exceeding the limit "carries over" to the next
377  * period.
378  */
379 void
380 rctl_throttle_decay(struct racct *racct, int resource)
381 {
382 	struct rctl_rule *rule;
383 	struct rctl_rule_link *link;
384 	int64_t minavailable;
385 
386 	ASSERT_RACCT_ENABLED();
387 	RACCT_LOCK_ASSERT();
388 
389 	minavailable = INT64_MAX;
390 
391 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
392 		rule = link->rrl_rule;
393 
394 		if (rule->rr_resource != resource)
395 			continue;
396 		if (rule->rr_action != RCTL_ACTION_THROTTLE)
397 			continue;
398 
399 		if (rule->rr_amount < minavailable)
400 			minavailable = rule->rr_amount;
401 	}
402 
403 	if (racct->r_resources[resource] < minavailable) {
404 		racct->r_resources[resource] = 0;
405 	} else {
406 		/*
407 		 * Cap utilization counter at ten times the limit.  Otherwise,
408 		 * if we changed the rule lowering the allowed amount, it could
409 		 * take unreasonably long time for the accumulated resource
410 		 * usage to drop.
411 		 */
412 		if (racct->r_resources[resource] > minavailable * 10)
413 			racct->r_resources[resource] = minavailable * 10;
414 
415 		racct->r_resources[resource] -= minavailable;
416 	}
417 }
418 
419 /*
420  * Special version of rctl_get_available() for the %CPU resource.
421  * We slightly cheat here and return less than we normally would.
422  */
423 int64_t
424 rctl_pcpu_available(const struct proc *p) {
425 	struct rctl_rule *rule;
426 	struct rctl_rule_link *link;
427 	int64_t available, minavailable, limit;
428 
429 	ASSERT_RACCT_ENABLED();
430 	RACCT_LOCK_ASSERT();
431 
432 	minavailable = INT64_MAX;
433 	limit = 0;
434 
435 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
436 		rule = link->rrl_rule;
437 		if (rule->rr_resource != RACCT_PCTCPU)
438 			continue;
439 		if (rule->rr_action != RCTL_ACTION_DENY)
440 			continue;
441 		available = rctl_available_resource(p, rule);
442 		if (available < minavailable) {
443 			minavailable = available;
444 			limit = rule->rr_amount;
445 		}
446 	}
447 
448 	/*
449 	 * Return slightly less than actual value of the available
450 	 * %cpu resource.  This makes %cpu throttling more aggressive
451 	 * and lets us act sooner than the limits are already exceeded.
452 	 */
453 	if (limit != 0) {
454 		if (limit > 2 * RCTL_PCPU_SHIFT)
455 			minavailable -= RCTL_PCPU_SHIFT;
456 		else
457 			minavailable -= (limit / 2);
458 	}
459 
460 	return (minavailable);
461 }
462 
/*
 * Saturating addition: return a + b, or UINT64_MAX if the sum does not
 * fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

	/* The sum overflows exactly when b exceeds the room left above a. */
	if (b > UINT64_MAX - a)
		return (UINT64_MAX);

	return (a + b);
}
478 
/*
 * Saturating multiplication: return a * b, or UINT64_MAX if the product
 * does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	if (b == 0)
		return (0);
	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
488 
489 /*
490  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
491  * to what it keeps allocated now.  Returns non-zero if the allocation should
492  * be denied, 0 otherwise.
493  */
494 int
495 rctl_enforce(struct proc *p, int resource, uint64_t amount)
496 {
497 	static struct timeval log_lasttime, devctl_lasttime;
498 	static int log_curtime = 0, devctl_curtime = 0;
499 	struct rctl_rule *rule;
500 	struct rctl_rule_link *link;
501 	struct sbuf sb;
502 	char *buf;
503 	int64_t available;
504 	uint64_t sleep_ms, sleep_ratio;
505 	int should_deny = 0;
506 
507 	ASSERT_RACCT_ENABLED();
508 	RACCT_LOCK_ASSERT();
509 
510 	/*
511 	 * There may be more than one matching rule; go through all of them.
512 	 * Denial should be done last, after logging and sending signals.
513 	 */
514 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
515 		rule = link->rrl_rule;
516 		if (rule->rr_resource != resource)
517 			continue;
518 
519 		available = rctl_available_resource(p, rule);
520 		if (available >= (int64_t)amount) {
521 			link->rrl_exceeded = 0;
522 			continue;
523 		}
524 
525 		switch (rule->rr_action) {
526 		case RCTL_ACTION_DENY:
527 			should_deny = 1;
528 			continue;
529 		case RCTL_ACTION_LOG:
530 			/*
531 			 * If rrl_exceeded != 0, it means we've already
532 			 * logged a warning for this process.
533 			 */
534 			if (link->rrl_exceeded != 0)
535 				continue;
536 
537 			/*
538 			 * If the process state is not fully initialized yet,
539 			 * we can't access most of the required fields, e.g.
540 			 * p->p_comm.  This happens when called from fork1().
541 			 * Ignore this rule for now; it will be processed just
542 			 * after fork, when called from racct_proc_fork_done().
543 			 */
544 			if (p->p_state != PRS_NORMAL)
545 				continue;
546 
547 			if (!ppsratecheck(&log_lasttime, &log_curtime,
548 			    rctl_log_rate_limit))
549 				continue;
550 
551 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
552 			if (buf == NULL) {
553 				printf("rctl_enforce: out of memory\n");
554 				continue;
555 			}
556 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
557 			rctl_rule_to_sbuf(&sb, rule);
558 			sbuf_finish(&sb);
559 			printf("rctl: rule \"%s\" matched by pid %d "
560 			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
561 			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
562 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
563 			sbuf_delete(&sb);
564 			free(buf, M_RCTL);
565 			link->rrl_exceeded = 1;
566 			continue;
567 		case RCTL_ACTION_DEVCTL:
568 			if (link->rrl_exceeded != 0)
569 				continue;
570 
571 			if (p->p_state != PRS_NORMAL)
572 				continue;
573 
574 			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
575 			    rctl_devctl_rate_limit))
576 				continue;
577 
578 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
579 			if (buf == NULL) {
580 				printf("rctl_enforce: out of memory\n");
581 				continue;
582 			}
583 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
584 			sbuf_printf(&sb, "rule=");
585 			rctl_rule_to_sbuf(&sb, rule);
586 			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
587 			    p->p_pid, p->p_ucred->cr_ruid,
588 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
589 			sbuf_finish(&sb);
590 			devctl_notify("RCTL", "rule", "matched",
591 			    sbuf_data(&sb));
592 			sbuf_delete(&sb);
593 			free(buf, M_RCTL);
594 			link->rrl_exceeded = 1;
595 			continue;
596 		case RCTL_ACTION_THROTTLE:
597 			if (p->p_state != PRS_NORMAL)
598 				continue;
599 
600 			if (rule->rr_amount == 0) {
601 				racct_proc_throttle(p, rctl_throttle_max);
602 				continue;
603 			}
604 
605 			/*
606 			 * Make the process sleep for a fraction of second
607 			 * proportional to the ratio of process' resource
608 			 * utilization compared to the limit.  The point is
609 			 * to penalize resource hogs: processes that consume
610 			 * more of the available resources sleep for longer.
611 			 *
612 			 * We're trying to defer division until the very end,
613 			 * to minimize the rounding effects.  The following
614 			 * calculation could have been written in a clearer
615 			 * way like this:
616 			 *
617 			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
618 			 *     rule->rr_amount;
619 			 * sleep_ms *= rctl_throttle_pct / 100;
620 			 * if (sleep_ms < rctl_throttle_min)
621 			 *         sleep_ms = rctl_throttle_min;
622 			 *
623 			 */
624 			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
625 			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
626 			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
627 				sleep_ms = rctl_throttle_min * rule->rr_amount;
628 
629 			/*
630 			 * Multiply that by the ratio of the resource
631 			 * consumption for the container compared to the limit,
632 			 * squared.  In other words, a process in a container
633 			 * that is two times over the limit will be throttled
634 			 * four times as much for hitting the same rule.  The
635 			 * point is to penalize processes more if the container
636 			 * itself (eg certain UID or jail) is above the limit.
637 			 */
638 			if (available < 0)
639 				sleep_ratio = -available / rule->rr_amount;
640 			else
641 				sleep_ratio = 0;
642 			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
643 			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
644 			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
645 
646 			/*
647 			 * Finally the division.
648 			 */
649 			sleep_ms /= rule->rr_amount;
650 
651 			if (sleep_ms > rctl_throttle_max)
652 				sleep_ms = rctl_throttle_max;
653 #if 0
654 			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
655 			   __func__, p->p_pid, p->p_comm,
656 			   p->p_racct->r_resources[resource],
657 			   rule->rr_amount, (uintmax_t)sleep_ms,
658 			   (uintmax_t)sleep_ratio, (intmax_t)available);
659 #endif
660 
661 			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
662 			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
663 			racct_proc_throttle(p, sleep_ms);
664 			continue;
665 		default:
666 			if (link->rrl_exceeded != 0)
667 				continue;
668 
669 			if (p->p_state != PRS_NORMAL)
670 				continue;
671 
672 			KASSERT(rule->rr_action > 0 &&
673 			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
674 			    ("rctl_enforce: unknown action %d",
675 			     rule->rr_action));
676 
677 			/*
678 			 * We're using the fact that RCTL_ACTION_SIG* values
679 			 * are equal to their counterparts from sys/signal.h.
680 			 */
681 			kern_psignal(p, rule->rr_action);
682 			link->rrl_exceeded = 1;
683 			continue;
684 		}
685 	}
686 
687 	if (should_deny) {
688 		/*
689 		 * Return fake error code; the caller should change it
690 		 * into one proper for the situation - EFSIZ, ENOMEM etc.
691 		 */
692 		return (EDOOFUS);
693 	}
694 
695 	return (0);
696 }
697 
698 uint64_t
699 rctl_get_limit(struct proc *p, int resource)
700 {
701 	struct rctl_rule *rule;
702 	struct rctl_rule_link *link;
703 	uint64_t amount = UINT64_MAX;
704 
705 	ASSERT_RACCT_ENABLED();
706 	RACCT_LOCK_ASSERT();
707 
708 	/*
709 	 * There may be more than one matching rule; go through all of them.
710 	 * Denial should be done last, after logging and sending signals.
711 	 */
712 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
713 		rule = link->rrl_rule;
714 		if (rule->rr_resource != resource)
715 			continue;
716 		if (rule->rr_action != RCTL_ACTION_DENY)
717 			continue;
718 		if (rule->rr_amount < amount)
719 			amount = rule->rr_amount;
720 	}
721 
722 	return (amount);
723 }
724 
725 uint64_t
726 rctl_get_available(struct proc *p, int resource)
727 {
728 	struct rctl_rule *rule;
729 	struct rctl_rule_link *link;
730 	int64_t available, minavailable, allocated;
731 
732 	minavailable = INT64_MAX;
733 
734 	ASSERT_RACCT_ENABLED();
735 	RACCT_LOCK_ASSERT();
736 
737 	/*
738 	 * There may be more than one matching rule; go through all of them.
739 	 * Denial should be done last, after logging and sending signals.
740 	 */
741 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
742 		rule = link->rrl_rule;
743 		if (rule->rr_resource != resource)
744 			continue;
745 		if (rule->rr_action != RCTL_ACTION_DENY)
746 			continue;
747 		available = rctl_available_resource(p, rule);
748 		if (available < minavailable)
749 			minavailable = available;
750 	}
751 
752 	/*
753 	 * XXX: Think about this _hard_.
754 	 */
755 	allocated = p->p_racct->r_resources[resource];
756 	if (minavailable < INT64_MAX - allocated)
757 		minavailable += allocated;
758 	if (minavailable < 0)
759 		minavailable = 0;
760 
761 	return (minavailable);
762 }
763 
764 static int
765 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
766 {
767 
768 	ASSERT_RACCT_ENABLED();
769 
770 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
771 		if (rule->rr_subject_type != filter->rr_subject_type)
772 			return (0);
773 
774 		switch (filter->rr_subject_type) {
775 		case RCTL_SUBJECT_TYPE_PROCESS:
776 			if (filter->rr_subject.rs_proc != NULL &&
777 			    rule->rr_subject.rs_proc !=
778 			    filter->rr_subject.rs_proc)
779 				return (0);
780 			break;
781 		case RCTL_SUBJECT_TYPE_USER:
782 			if (filter->rr_subject.rs_uip != NULL &&
783 			    rule->rr_subject.rs_uip !=
784 			    filter->rr_subject.rs_uip)
785 				return (0);
786 			break;
787 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
788 			if (filter->rr_subject.rs_loginclass != NULL &&
789 			    rule->rr_subject.rs_loginclass !=
790 			    filter->rr_subject.rs_loginclass)
791 				return (0);
792 			break;
793 		case RCTL_SUBJECT_TYPE_JAIL:
794 			if (filter->rr_subject.rs_prison_racct != NULL &&
795 			    rule->rr_subject.rs_prison_racct !=
796 			    filter->rr_subject.rs_prison_racct)
797 				return (0);
798 			break;
799 		default:
800 			panic("rctl_rule_matches: unknown subject type %d",
801 			    filter->rr_subject_type);
802 		}
803 	}
804 
805 	if (filter->rr_resource != RACCT_UNDEFINED) {
806 		if (rule->rr_resource != filter->rr_resource)
807 			return (0);
808 	}
809 
810 	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
811 		if (rule->rr_action != filter->rr_action)
812 			return (0);
813 	}
814 
815 	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
816 		if (rule->rr_amount != filter->rr_amount)
817 			return (0);
818 	}
819 
820 	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
821 		if (rule->rr_per != filter->rr_per)
822 			return (0);
823 	}
824 
825 	return (1);
826 }
827 
828 static int
829 str2value(const char *str, int *value, struct dict *table)
830 {
831 	int i;
832 
833 	if (value == NULL)
834 		return (EINVAL);
835 
836 	for (i = 0; table[i].d_name != NULL; i++) {
837 		if (strcasecmp(table[i].d_name, str) == 0) {
838 			*value =  table[i].d_value;
839 			return (0);
840 		}
841 	}
842 
843 	return (EINVAL);
844 }
845 
846 static int
847 str2id(const char *str, id_t *value)
848 {
849 	char *end;
850 
851 	if (str == NULL)
852 		return (EINVAL);
853 
854 	*value = strtoul(str, &end, 10);
855 	if ((size_t)(end - str) != strlen(str))
856 		return (EINVAL);
857 
858 	return (0);
859 }
860 
/*
 * Parse 'str' as a base 10 number with strtoul() and store it in
 * '*value'.  Returns EINVAL if 'str' is NULL or if anything follows the
 * number, ERANGE if the stored value is negative, 0 otherwise.  Note
 * that '*value' is written even when an error is returned for a
 * non-NULL string.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *stop;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &stop, 10);

	/* The whole string must have been consumed. */
	if (*stop != '\0')
		return (EINVAL);

	return (*value < 0 ? ERANGE : 0);
}
878 
879 /*
880  * Connect the rule to the racct, increasing refcount for the rule.
881  */
882 static void
883 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
884 {
885 	struct rctl_rule_link *link;
886 
887 	ASSERT_RACCT_ENABLED();
888 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
889 
890 	rctl_rule_acquire(rule);
891 	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
892 	link->rrl_rule = rule;
893 	link->rrl_exceeded = 0;
894 
895 	RACCT_LOCK();
896 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
897 	RACCT_UNLOCK();
898 }
899 
900 static int
901 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
902 {
903 	struct rctl_rule_link *link;
904 
905 	ASSERT_RACCT_ENABLED();
906 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
907 	RACCT_LOCK_ASSERT();
908 
909 	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
910 	if (link == NULL)
911 		return (ENOMEM);
912 	rctl_rule_acquire(rule);
913 	link->rrl_rule = rule;
914 	link->rrl_exceeded = 0;
915 
916 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
917 
918 	return (0);
919 }
920 
921 /*
922  * Remove limits for a rules matching the filter and release
923  * the refcounts for the rules, possibly freeing them.  Returns
924  * the number of limit structures removed.
925  */
926 static int
927 rctl_racct_remove_rules(struct racct *racct,
928     const struct rctl_rule *filter)
929 {
930 	struct rctl_rule_link *link, *linktmp;
931 	int removed = 0;
932 
933 	ASSERT_RACCT_ENABLED();
934 	RACCT_LOCK_ASSERT();
935 
936 	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
937 		if (!rctl_rule_matches(link->rrl_rule, filter))
938 			continue;
939 
940 		LIST_REMOVE(link, rrl_next);
941 		rctl_rule_release(link->rrl_rule);
942 		uma_zfree(rctl_rule_link_zone, link);
943 		removed++;
944 	}
945 	return (removed);
946 }
947 
948 static void
949 rctl_rule_acquire_subject(struct rctl_rule *rule)
950 {
951 
952 	ASSERT_RACCT_ENABLED();
953 
954 	switch (rule->rr_subject_type) {
955 	case RCTL_SUBJECT_TYPE_UNDEFINED:
956 	case RCTL_SUBJECT_TYPE_PROCESS:
957 		break;
958 	case RCTL_SUBJECT_TYPE_JAIL:
959 		if (rule->rr_subject.rs_prison_racct != NULL)
960 			prison_racct_hold(rule->rr_subject.rs_prison_racct);
961 		break;
962 	case RCTL_SUBJECT_TYPE_USER:
963 		if (rule->rr_subject.rs_uip != NULL)
964 			uihold(rule->rr_subject.rs_uip);
965 		break;
966 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
967 		if (rule->rr_subject.rs_loginclass != NULL)
968 			loginclass_hold(rule->rr_subject.rs_loginclass);
969 		break;
970 	default:
971 		panic("rctl_rule_acquire_subject: unknown subject type %d",
972 		    rule->rr_subject_type);
973 	}
974 }
975 
976 static void
977 rctl_rule_release_subject(struct rctl_rule *rule)
978 {
979 
980 	ASSERT_RACCT_ENABLED();
981 
982 	switch (rule->rr_subject_type) {
983 	case RCTL_SUBJECT_TYPE_UNDEFINED:
984 	case RCTL_SUBJECT_TYPE_PROCESS:
985 		break;
986 	case RCTL_SUBJECT_TYPE_JAIL:
987 		if (rule->rr_subject.rs_prison_racct != NULL)
988 			prison_racct_free(rule->rr_subject.rs_prison_racct);
989 		break;
990 	case RCTL_SUBJECT_TYPE_USER:
991 		if (rule->rr_subject.rs_uip != NULL)
992 			uifree(rule->rr_subject.rs_uip);
993 		break;
994 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
995 		if (rule->rr_subject.rs_loginclass != NULL)
996 			loginclass_free(rule->rr_subject.rs_loginclass);
997 		break;
998 	default:
999 		panic("rctl_rule_release_subject: unknown subject type %d",
1000 		    rule->rr_subject_type);
1001 	}
1002 }
1003 
1004 struct rctl_rule *
1005 rctl_rule_alloc(int flags)
1006 {
1007 	struct rctl_rule *rule;
1008 
1009 	ASSERT_RACCT_ENABLED();
1010 
1011 	rule = uma_zalloc(rctl_rule_zone, flags);
1012 	if (rule == NULL)
1013 		return (NULL);
1014 	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1015 	rule->rr_subject.rs_proc = NULL;
1016 	rule->rr_subject.rs_uip = NULL;
1017 	rule->rr_subject.rs_loginclass = NULL;
1018 	rule->rr_subject.rs_prison_racct = NULL;
1019 	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1020 	rule->rr_resource = RACCT_UNDEFINED;
1021 	rule->rr_action = RCTL_ACTION_UNDEFINED;
1022 	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1023 	refcount_init(&rule->rr_refcount, 1);
1024 
1025 	return (rule);
1026 }
1027 
1028 struct rctl_rule *
1029 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1030 {
1031 	struct rctl_rule *copy;
1032 
1033 	ASSERT_RACCT_ENABLED();
1034 
1035 	copy = uma_zalloc(rctl_rule_zone, flags);
1036 	if (copy == NULL)
1037 		return (NULL);
1038 	copy->rr_subject_type = rule->rr_subject_type;
1039 	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1040 	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1041 	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1042 	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1043 	copy->rr_per = rule->rr_per;
1044 	copy->rr_resource = rule->rr_resource;
1045 	copy->rr_action = rule->rr_action;
1046 	copy->rr_amount = rule->rr_amount;
1047 	refcount_init(&copy->rr_refcount, 1);
1048 	rctl_rule_acquire_subject(copy);
1049 
1050 	return (copy);
1051 }
1052 
/*
 * Take one more reference on 'rule'.  The reference count is asserted
 * to be positive on entry.
 */
void
rctl_rule_acquire(struct rctl_rule *rule)
{

	ASSERT_RACCT_ENABLED();
	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));

	refcount_acquire(&rule->rr_refcount);
}
1062 
1063 static void
1064 rctl_rule_free(void *context, int pending)
1065 {
1066 	struct rctl_rule *rule;
1067 
1068 	rule = (struct rctl_rule *)context;
1069 
1070 	ASSERT_RACCT_ENABLED();
1071 	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1072 
1073 	/*
1074 	 * We don't need locking here; rule is guaranteed to be inaccessible.
1075 	 */
1076 
1077 	rctl_rule_release_subject(rule);
1078 	uma_zfree(rctl_rule_zone, rule);
1079 }
1080 
1081 void
1082 rctl_rule_release(struct rctl_rule *rule)
1083 {
1084 
1085 	ASSERT_RACCT_ENABLED();
1086 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1087 
1088 	if (refcount_release(&rule->rr_refcount)) {
1089 		/*
1090 		 * rctl_rule_release() is often called when iterating
1091 		 * over all the uidinfo structures in the system,
1092 		 * holding uihashtbl_lock.  Since rctl_rule_free()
1093 		 * might end up calling uifree(), this would lead
1094 		 * to lock recursion.  Use taskqueue to avoid this.
1095 		 */
1096 		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1097 		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1098 	}
1099 }
1100 
1101 static int
1102 rctl_rule_fully_specified(const struct rctl_rule *rule)
1103 {
1104 
1105 	ASSERT_RACCT_ENABLED();
1106 
1107 	switch (rule->rr_subject_type) {
1108 	case RCTL_SUBJECT_TYPE_UNDEFINED:
1109 		return (0);
1110 	case RCTL_SUBJECT_TYPE_PROCESS:
1111 		if (rule->rr_subject.rs_proc == NULL)
1112 			return (0);
1113 		break;
1114 	case RCTL_SUBJECT_TYPE_USER:
1115 		if (rule->rr_subject.rs_uip == NULL)
1116 			return (0);
1117 		break;
1118 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1119 		if (rule->rr_subject.rs_loginclass == NULL)
1120 			return (0);
1121 		break;
1122 	case RCTL_SUBJECT_TYPE_JAIL:
1123 		if (rule->rr_subject.rs_prison_racct == NULL)
1124 			return (0);
1125 		break;
1126 	default:
1127 		panic("rctl_rule_fully_specified: unknown subject type %d",
1128 		    rule->rr_subject_type);
1129 	}
1130 	if (rule->rr_resource == RACCT_UNDEFINED)
1131 		return (0);
1132 	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1133 		return (0);
1134 	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1135 		return (0);
1136 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1137 		return (0);
1138 
1139 	return (1);
1140 }
1141 
1142 static int
1143 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1144 {
1145 	struct rctl_rule *rule;
1146 	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1147 	     *amountstr, *perstr;
1148 	id_t id;
1149 	int error = 0;
1150 
1151 	ASSERT_RACCT_ENABLED();
1152 
1153 	rule = rctl_rule_alloc(M_WAITOK);
1154 
1155 	subjectstr = strsep(&rulestr, ":");
1156 	subject_idstr = strsep(&rulestr, ":");
1157 	resourcestr = strsep(&rulestr, ":");
1158 	actionstr = strsep(&rulestr, "=/");
1159 	amountstr = strsep(&rulestr, "/");
1160 	perstr = rulestr;
1161 
1162 	if (subjectstr == NULL || subjectstr[0] == '\0')
1163 		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1164 	else {
1165 		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1166 		if (error != 0)
1167 			goto out;
1168 	}
1169 
1170 	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1171 		rule->rr_subject.rs_proc = NULL;
1172 		rule->rr_subject.rs_uip = NULL;
1173 		rule->rr_subject.rs_loginclass = NULL;
1174 		rule->rr_subject.rs_prison_racct = NULL;
1175 	} else {
1176 		switch (rule->rr_subject_type) {
1177 		case RCTL_SUBJECT_TYPE_UNDEFINED:
1178 			error = EINVAL;
1179 			goto out;
1180 		case RCTL_SUBJECT_TYPE_PROCESS:
1181 			error = str2id(subject_idstr, &id);
1182 			if (error != 0)
1183 				goto out;
1184 			sx_assert(&allproc_lock, SA_LOCKED);
1185 			rule->rr_subject.rs_proc = pfind(id);
1186 			if (rule->rr_subject.rs_proc == NULL) {
1187 				error = ESRCH;
1188 				goto out;
1189 			}
1190 			PROC_UNLOCK(rule->rr_subject.rs_proc);
1191 			break;
1192 		case RCTL_SUBJECT_TYPE_USER:
1193 			error = str2id(subject_idstr, &id);
1194 			if (error != 0)
1195 				goto out;
1196 			rule->rr_subject.rs_uip = uifind(id);
1197 			break;
1198 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1199 			rule->rr_subject.rs_loginclass =
1200 			    loginclass_find(subject_idstr);
1201 			if (rule->rr_subject.rs_loginclass == NULL) {
1202 				error = ENAMETOOLONG;
1203 				goto out;
1204 			}
1205 			break;
1206 		case RCTL_SUBJECT_TYPE_JAIL:
1207 			rule->rr_subject.rs_prison_racct =
1208 			    prison_racct_find(subject_idstr);
1209 			if (rule->rr_subject.rs_prison_racct == NULL) {
1210 				error = ENAMETOOLONG;
1211 				goto out;
1212 			}
1213 			break;
1214                default:
1215                        panic("rctl_string_to_rule: unknown subject type %d",
1216                            rule->rr_subject_type);
1217                }
1218 	}
1219 
1220 	if (resourcestr == NULL || resourcestr[0] == '\0')
1221 		rule->rr_resource = RACCT_UNDEFINED;
1222 	else {
1223 		error = str2value(resourcestr, &rule->rr_resource,
1224 		    resourcenames);
1225 		if (error != 0)
1226 			goto out;
1227 	}
1228 
1229 	if (actionstr == NULL || actionstr[0] == '\0')
1230 		rule->rr_action = RCTL_ACTION_UNDEFINED;
1231 	else {
1232 		error = str2value(actionstr, &rule->rr_action, actionnames);
1233 		if (error != 0)
1234 			goto out;
1235 	}
1236 
1237 	if (amountstr == NULL || amountstr[0] == '\0')
1238 		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1239 	else {
1240 		error = str2int64(amountstr, &rule->rr_amount);
1241 		if (error != 0)
1242 			goto out;
1243 		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1244 			if (rule->rr_amount > INT64_MAX / 1000000) {
1245 				error = ERANGE;
1246 				goto out;
1247 			}
1248 			rule->rr_amount *= 1000000;
1249 		}
1250 	}
1251 
1252 	if (perstr == NULL || perstr[0] == '\0')
1253 		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1254 	else {
1255 		error = str2value(perstr, &rule->rr_per, subjectnames);
1256 		if (error != 0)
1257 			goto out;
1258 	}
1259 
1260 out:
1261 	if (error == 0)
1262 		*rulep = rule;
1263 	else
1264 		rctl_rule_release(rule);
1265 
1266 	return (error);
1267 }
1268 
1269 /*
1270  * Link a rule with all the subjects it applies to.
1271  */
1272 int
1273 rctl_rule_add(struct rctl_rule *rule)
1274 {
1275 	struct proc *p;
1276 	struct ucred *cred;
1277 	struct uidinfo *uip;
1278 	struct prison *pr;
1279 	struct prison_racct *prr;
1280 	struct loginclass *lc;
1281 	struct rctl_rule *rule2;
1282 	int match;
1283 
1284 	ASSERT_RACCT_ENABLED();
1285 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1286 
1287 	/*
1288 	 * Some rules just don't make sense, like "deny" rule for an undeniable
1289 	 * resource.  The exception are the RSS and %CPU resources - they are
1290 	 * not deniable in the racct sense, but the limit is enforced in
1291 	 * a different way.
1292 	 */
1293 	if (rule->rr_action == RCTL_ACTION_DENY &&
1294 	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
1295 	    rule->rr_resource != RACCT_RSS &&
1296 	    rule->rr_resource != RACCT_PCTCPU) {
1297 		return (EOPNOTSUPP);
1298 	}
1299 
1300 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1301 	    !RACCT_IS_DECAYING(rule->rr_resource)) {
1302 		return (EOPNOTSUPP);
1303 	}
1304 
1305 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1306 	    rule->rr_resource == RACCT_PCTCPU) {
1307 		return (EOPNOTSUPP);
1308 	}
1309 
1310 	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1311 	    RACCT_IS_SLOPPY(rule->rr_resource)) {
1312 		return (EOPNOTSUPP);
1313 	}
1314 
1315 	/*
1316 	 * Make sure there are no duplicated rules.  Also, for the "deny"
1317 	 * rules, remove ones differing only by "amount".
1318 	 */
1319 	if (rule->rr_action == RCTL_ACTION_DENY) {
1320 		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1321 		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1322 		rctl_rule_remove(rule2);
1323 		rctl_rule_release(rule2);
1324 	} else
1325 		rctl_rule_remove(rule);
1326 
1327 	switch (rule->rr_subject_type) {
1328 	case RCTL_SUBJECT_TYPE_PROCESS:
1329 		p = rule->rr_subject.rs_proc;
1330 		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1331 
1332 		rctl_racct_add_rule(p->p_racct, rule);
1333 		/*
1334 		 * In case of per-process rule, we don't have anything more
1335 		 * to do.
1336 		 */
1337 		return (0);
1338 
1339 	case RCTL_SUBJECT_TYPE_USER:
1340 		uip = rule->rr_subject.rs_uip;
1341 		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1342 		rctl_racct_add_rule(uip->ui_racct, rule);
1343 		break;
1344 
1345 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1346 		lc = rule->rr_subject.rs_loginclass;
1347 		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1348 		rctl_racct_add_rule(lc->lc_racct, rule);
1349 		break;
1350 
1351 	case RCTL_SUBJECT_TYPE_JAIL:
1352 		prr = rule->rr_subject.rs_prison_racct;
1353 		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1354 		rctl_racct_add_rule(prr->prr_racct, rule);
1355 		break;
1356 
1357 	default:
1358 		panic("rctl_rule_add: unknown subject type %d",
1359 		    rule->rr_subject_type);
1360 	}
1361 
1362 	/*
1363 	 * Now go through all the processes and add the new rule to the ones
1364 	 * it applies to.
1365 	 */
1366 	sx_assert(&allproc_lock, SA_LOCKED);
1367 	FOREACH_PROC_IN_SYSTEM(p) {
1368 		cred = p->p_ucred;
1369 		switch (rule->rr_subject_type) {
1370 		case RCTL_SUBJECT_TYPE_USER:
1371 			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1372 			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1373 				break;
1374 			continue;
1375 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1376 			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1377 				break;
1378 			continue;
1379 		case RCTL_SUBJECT_TYPE_JAIL:
1380 			match = 0;
1381 			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1382 				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1383 					match = 1;
1384 					break;
1385 				}
1386 			}
1387 			if (match)
1388 				break;
1389 			continue;
1390 		default:
1391 			panic("rctl_rule_add: unknown subject type %d",
1392 			    rule->rr_subject_type);
1393 		}
1394 
1395 		rctl_racct_add_rule(p->p_racct, rule);
1396 	}
1397 
1398 	return (0);
1399 }
1400 
/*
 * "Pre" hook handed to the *_racct_foreach() iterators by this file:
 * takes the racct lock.
 */
static void
rctl_rule_pre_callback(void)
{

	RACCT_LOCK();
}
1407 
/*
 * "Post" hook handed to the *_racct_foreach() iterators by this file:
 * drops the racct lock.
 */
static void
rctl_rule_post_callback(void)
{

	RACCT_UNLOCK();
}
1414 
/*
 * Per-racct callback used by rctl_rule_remove().  'arg2' is the filter
 * rule and 'arg3' points at an int that accumulates the value returned
 * by rctl_racct_remove_rules().  Runs with the racct lock held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *totalp = arg3;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	*totalp += rctl_racct_remove_rules(racct, filter);
}
1428 
1429 /*
1430  * Remove all rules that match the filter.
1431  */
1432 int
1433 rctl_rule_remove(struct rctl_rule *filter)
1434 {
1435 	struct proc *p;
1436 	int found = 0;
1437 
1438 	ASSERT_RACCT_ENABLED();
1439 
1440 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1441 	    filter->rr_subject.rs_proc != NULL) {
1442 		p = filter->rr_subject.rs_proc;
1443 		RACCT_LOCK();
1444 		found = rctl_racct_remove_rules(p->p_racct, filter);
1445 		RACCT_UNLOCK();
1446 		if (found)
1447 			return (0);
1448 		return (ESRCH);
1449 	}
1450 
1451 	loginclass_racct_foreach(rctl_rule_remove_callback,
1452 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1453 	    filter, (void *)&found);
1454 	ui_racct_foreach(rctl_rule_remove_callback,
1455 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1456 	    filter, (void *)&found);
1457 	prison_racct_foreach(rctl_rule_remove_callback,
1458 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1459 	    filter, (void *)&found);
1460 
1461 	sx_assert(&allproc_lock, SA_LOCKED);
1462 	RACCT_LOCK();
1463 	FOREACH_PROC_IN_SYSTEM(p) {
1464 		found += rctl_racct_remove_rules(p->p_racct, filter);
1465 	}
1466 	RACCT_UNLOCK();
1467 
1468 	if (found)
1469 		return (0);
1470 	return (ESRCH);
1471 }
1472 
1473 /*
1474  * Appends a rule to the sbuf.
1475  */
1476 static void
1477 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1478 {
1479 	int64_t amount;
1480 
1481 	ASSERT_RACCT_ENABLED();
1482 
1483 	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1484 
1485 	switch (rule->rr_subject_type) {
1486 	case RCTL_SUBJECT_TYPE_PROCESS:
1487 		if (rule->rr_subject.rs_proc == NULL)
1488 			sbuf_printf(sb, ":");
1489 		else
1490 			sbuf_printf(sb, "%d:",
1491 			    rule->rr_subject.rs_proc->p_pid);
1492 		break;
1493 	case RCTL_SUBJECT_TYPE_USER:
1494 		if (rule->rr_subject.rs_uip == NULL)
1495 			sbuf_printf(sb, ":");
1496 		else
1497 			sbuf_printf(sb, "%d:",
1498 			    rule->rr_subject.rs_uip->ui_uid);
1499 		break;
1500 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1501 		if (rule->rr_subject.rs_loginclass == NULL)
1502 			sbuf_printf(sb, ":");
1503 		else
1504 			sbuf_printf(sb, "%s:",
1505 			    rule->rr_subject.rs_loginclass->lc_name);
1506 		break;
1507 	case RCTL_SUBJECT_TYPE_JAIL:
1508 		if (rule->rr_subject.rs_prison_racct == NULL)
1509 			sbuf_printf(sb, ":");
1510 		else
1511 			sbuf_printf(sb, "%s:",
1512 			    rule->rr_subject.rs_prison_racct->prr_name);
1513 		break;
1514 	default:
1515 		panic("rctl_rule_to_sbuf: unknown subject type %d",
1516 		    rule->rr_subject_type);
1517 	}
1518 
1519 	amount = rule->rr_amount;
1520 	if (amount != RCTL_AMOUNT_UNDEFINED &&
1521 	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1522 		amount /= 1000000;
1523 
1524 	sbuf_printf(sb, "%s:%s=%jd",
1525 	    rctl_resource_name(rule->rr_resource),
1526 	    rctl_action_name(rule->rr_action),
1527 	    amount);
1528 
1529 	if (rule->rr_per != rule->rr_subject_type)
1530 		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1531 }
1532 
1533 /*
1534  * Routine used by RCTL syscalls to read in input string.
1535  */
1536 static int
1537 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1538 {
1539 	char *str;
1540 	int error;
1541 
1542 	ASSERT_RACCT_ENABLED();
1543 
1544 	if (inbuflen <= 0)
1545 		return (EINVAL);
1546 	if (inbuflen > RCTL_MAX_INBUFSIZE)
1547 		return (E2BIG);
1548 
1549 	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1550 	error = copyinstr(inbufp, str, inbuflen, NULL);
1551 	if (error != 0) {
1552 		free(str, M_RCTL);
1553 		return (error);
1554 	}
1555 
1556 	*inputstr = str;
1557 
1558 	return (0);
1559 }
1560 
1561 /*
1562  * Routine used by RCTL syscalls to write out output string.
1563  */
1564 static int
1565 rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1566 {
1567 	int error;
1568 
1569 	ASSERT_RACCT_ENABLED();
1570 
1571 	if (outputsbuf == NULL)
1572 		return (0);
1573 
1574 	sbuf_finish(outputsbuf);
1575 	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1576 		sbuf_delete(outputsbuf);
1577 		return (ERANGE);
1578 	}
1579 	error = copyout(sbuf_data(outputsbuf), outbufp,
1580 	    sbuf_len(outputsbuf) + 1);
1581 	sbuf_delete(outputsbuf);
1582 	return (error);
1583 }
1584 
1585 static struct sbuf *
1586 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1587 {
1588 	struct sbuf *sb;
1589 	int64_t amount;
1590 	int i;
1591 
1592 	ASSERT_RACCT_ENABLED();
1593 
1594 	sb = sbuf_new_auto();
1595 	for (i = 0; i <= RACCT_MAX; i++) {
1596 		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1597 			continue;
1598 		RACCT_LOCK();
1599 		amount = racct->r_resources[i];
1600 		RACCT_UNLOCK();
1601 		if (RACCT_IS_IN_MILLIONS(i))
1602 			amount /= 1000000;
1603 		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1604 	}
1605 	sbuf_setpos(sb, sbuf_len(sb) - 1);
1606 	return (sb);
1607 }
1608 
1609 int
1610 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1611 {
1612 	struct rctl_rule *filter;
1613 	struct sbuf *outputsbuf = NULL;
1614 	struct proc *p;
1615 	struct uidinfo *uip;
1616 	struct loginclass *lc;
1617 	struct prison_racct *prr;
1618 	char *inputstr;
1619 	int error;
1620 
1621 	if (!racct_enable)
1622 		return (ENOSYS);
1623 
1624 	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1625 	if (error != 0)
1626 		return (error);
1627 
1628 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1629 	if (error != 0)
1630 		return (error);
1631 
1632 	sx_slock(&allproc_lock);
1633 	error = rctl_string_to_rule(inputstr, &filter);
1634 	free(inputstr, M_RCTL);
1635 	if (error != 0) {
1636 		sx_sunlock(&allproc_lock);
1637 		return (error);
1638 	}
1639 
1640 	switch (filter->rr_subject_type) {
1641 	case RCTL_SUBJECT_TYPE_PROCESS:
1642 		p = filter->rr_subject.rs_proc;
1643 		if (p == NULL) {
1644 			error = EINVAL;
1645 			goto out;
1646 		}
1647 		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1648 		break;
1649 	case RCTL_SUBJECT_TYPE_USER:
1650 		uip = filter->rr_subject.rs_uip;
1651 		if (uip == NULL) {
1652 			error = EINVAL;
1653 			goto out;
1654 		}
1655 		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1656 		break;
1657 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1658 		lc = filter->rr_subject.rs_loginclass;
1659 		if (lc == NULL) {
1660 			error = EINVAL;
1661 			goto out;
1662 		}
1663 		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1664 		break;
1665 	case RCTL_SUBJECT_TYPE_JAIL:
1666 		prr = filter->rr_subject.rs_prison_racct;
1667 		if (prr == NULL) {
1668 			error = EINVAL;
1669 			goto out;
1670 		}
1671 		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1672 		break;
1673 	default:
1674 		error = EINVAL;
1675 	}
1676 out:
1677 	rctl_rule_release(filter);
1678 	sx_sunlock(&allproc_lock);
1679 	if (error != 0)
1680 		return (error);
1681 
1682 	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1683 
1684 	return (error);
1685 }
1686 
1687 static void
1688 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1689 {
1690 	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1691 	struct rctl_rule_link *link;
1692 	struct sbuf *sb = (struct sbuf *)arg3;
1693 
1694 	ASSERT_RACCT_ENABLED();
1695 	RACCT_LOCK_ASSERT();
1696 
1697 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1698 		if (!rctl_rule_matches(link->rrl_rule, filter))
1699 			continue;
1700 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1701 		sbuf_printf(sb, ",");
1702 	}
1703 }
1704 
1705 int
1706 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1707 {
1708 	struct sbuf *sb;
1709 	struct rctl_rule *filter;
1710 	struct rctl_rule_link *link;
1711 	struct proc *p;
1712 	char *inputstr, *buf;
1713 	size_t bufsize;
1714 	int error;
1715 
1716 	if (!racct_enable)
1717 		return (ENOSYS);
1718 
1719 	error = priv_check(td, PRIV_RCTL_GET_RULES);
1720 	if (error != 0)
1721 		return (error);
1722 
1723 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1724 	if (error != 0)
1725 		return (error);
1726 
1727 	sx_slock(&allproc_lock);
1728 	error = rctl_string_to_rule(inputstr, &filter);
1729 	free(inputstr, M_RCTL);
1730 	if (error != 0) {
1731 		sx_sunlock(&allproc_lock);
1732 		return (error);
1733 	}
1734 
1735 	bufsize = uap->outbuflen;
1736 	if (bufsize > rctl_maxbufsize) {
1737 		sx_sunlock(&allproc_lock);
1738 		return (E2BIG);
1739 	}
1740 
1741 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1742 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1743 	KASSERT(sb != NULL, ("sbuf_new failed"));
1744 
1745 	FOREACH_PROC_IN_SYSTEM(p) {
1746 		RACCT_LOCK();
1747 		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1748 			/*
1749 			 * Non-process rules will be added to the buffer later.
1750 			 * Adding them here would result in duplicated output.
1751 			 */
1752 			if (link->rrl_rule->rr_subject_type !=
1753 			    RCTL_SUBJECT_TYPE_PROCESS)
1754 				continue;
1755 			if (!rctl_rule_matches(link->rrl_rule, filter))
1756 				continue;
1757 			rctl_rule_to_sbuf(sb, link->rrl_rule);
1758 			sbuf_printf(sb, ",");
1759 		}
1760 		RACCT_UNLOCK();
1761 	}
1762 
1763 	loginclass_racct_foreach(rctl_get_rules_callback,
1764 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1765 	    filter, sb);
1766 	ui_racct_foreach(rctl_get_rules_callback,
1767 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1768 	    filter, sb);
1769 	prison_racct_foreach(rctl_get_rules_callback,
1770 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1771 	    filter, sb);
1772 	if (sbuf_error(sb) == ENOMEM) {
1773 		error = ERANGE;
1774 		goto out;
1775 	}
1776 
1777 	/*
1778 	 * Remove trailing ",".
1779 	 */
1780 	if (sbuf_len(sb) > 0)
1781 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1782 
1783 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1784 out:
1785 	rctl_rule_release(filter);
1786 	sx_sunlock(&allproc_lock);
1787 	free(buf, M_RCTL);
1788 	return (error);
1789 }
1790 
1791 int
1792 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1793 {
1794 	struct sbuf *sb;
1795 	struct rctl_rule *filter;
1796 	struct rctl_rule_link *link;
1797 	char *inputstr, *buf;
1798 	size_t bufsize;
1799 	int error;
1800 
1801 	if (!racct_enable)
1802 		return (ENOSYS);
1803 
1804 	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1805 	if (error != 0)
1806 		return (error);
1807 
1808 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1809 	if (error != 0)
1810 		return (error);
1811 
1812 	sx_slock(&allproc_lock);
1813 	error = rctl_string_to_rule(inputstr, &filter);
1814 	free(inputstr, M_RCTL);
1815 	if (error != 0) {
1816 		sx_sunlock(&allproc_lock);
1817 		return (error);
1818 	}
1819 
1820 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1821 		rctl_rule_release(filter);
1822 		sx_sunlock(&allproc_lock);
1823 		return (EINVAL);
1824 	}
1825 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1826 		rctl_rule_release(filter);
1827 		sx_sunlock(&allproc_lock);
1828 		return (EOPNOTSUPP);
1829 	}
1830 	if (filter->rr_subject.rs_proc == NULL) {
1831 		rctl_rule_release(filter);
1832 		sx_sunlock(&allproc_lock);
1833 		return (EINVAL);
1834 	}
1835 
1836 	bufsize = uap->outbuflen;
1837 	if (bufsize > rctl_maxbufsize) {
1838 		rctl_rule_release(filter);
1839 		sx_sunlock(&allproc_lock);
1840 		return (E2BIG);
1841 	}
1842 
1843 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1844 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1845 	KASSERT(sb != NULL, ("sbuf_new failed"));
1846 
1847 	RACCT_LOCK();
1848 	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1849 	    rrl_next) {
1850 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1851 		sbuf_printf(sb, ",");
1852 	}
1853 	RACCT_UNLOCK();
1854 	if (sbuf_error(sb) == ENOMEM) {
1855 		error = ERANGE;
1856 		sbuf_delete(sb);
1857 		goto out;
1858 	}
1859 
1860 	/*
1861 	 * Remove trailing ",".
1862 	 */
1863 	if (sbuf_len(sb) > 0)
1864 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1865 
1866 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1867 out:
1868 	rctl_rule_release(filter);
1869 	sx_sunlock(&allproc_lock);
1870 	free(buf, M_RCTL);
1871 	return (error);
1872 }
1873 
1874 int
1875 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1876 {
1877 	struct rctl_rule *rule;
1878 	char *inputstr;
1879 	int error;
1880 
1881 	if (!racct_enable)
1882 		return (ENOSYS);
1883 
1884 	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1885 	if (error != 0)
1886 		return (error);
1887 
1888 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1889 	if (error != 0)
1890 		return (error);
1891 
1892 	sx_slock(&allproc_lock);
1893 	error = rctl_string_to_rule(inputstr, &rule);
1894 	free(inputstr, M_RCTL);
1895 	if (error != 0) {
1896 		sx_sunlock(&allproc_lock);
1897 		return (error);
1898 	}
1899 	/*
1900 	 * The 'per' part of a rule is optional.
1901 	 */
1902 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1903 	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1904 		rule->rr_per = rule->rr_subject_type;
1905 
1906 	if (!rctl_rule_fully_specified(rule)) {
1907 		error = EINVAL;
1908 		goto out;
1909 	}
1910 
1911 	error = rctl_rule_add(rule);
1912 
1913 out:
1914 	rctl_rule_release(rule);
1915 	sx_sunlock(&allproc_lock);
1916 	return (error);
1917 }
1918 
1919 int
1920 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1921 {
1922 	struct rctl_rule *filter;
1923 	char *inputstr;
1924 	int error;
1925 
1926 	if (!racct_enable)
1927 		return (ENOSYS);
1928 
1929 	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1930 	if (error != 0)
1931 		return (error);
1932 
1933 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1934 	if (error != 0)
1935 		return (error);
1936 
1937 	sx_slock(&allproc_lock);
1938 	error = rctl_string_to_rule(inputstr, &filter);
1939 	free(inputstr, M_RCTL);
1940 	if (error != 0) {
1941 		sx_sunlock(&allproc_lock);
1942 		return (error);
1943 	}
1944 
1945 	error = rctl_rule_remove(filter);
1946 	rctl_rule_release(filter);
1947 	sx_sunlock(&allproc_lock);
1948 
1949 	return (error);
1950 }
1951 
/*
 * Update RCTL rule list after credential change.
 *
 * Rebuilds p's rule list so that it holds the process-subject rules
 * already linked to p plus every rule currently linked to the racct of
 * newcred's cr_ruidinfo, cr_loginclass and cr_prison->pr_prison_racct.
 * Does nothing when racct_enable is zero.  The process lock must not be
 * held by the caller (see the PROC_LOCK_ASSERT() below).
 *
 * The link structures are allocated with M_WAITOK, which cannot be done
 * under the racct lock.  The function therefore counts the rules under
 * the lock, drops it to allocate, retakes it to fill the new list in,
 * and starts over if the rule lists changed in between.
 */
void
rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
{
	LIST_HEAD(, rctl_rule_link) newrules;
	struct rctl_rule_link *link, *newlink;
	struct uidinfo *newuip;
	struct loginclass *newlc;
	struct prison_racct *newprr;
	int rulecnt, i;

	if (!racct_enable)
		return;

	PROC_LOCK_ASSERT(p, MA_NOTOWNED);

	newuip = newcred->cr_ruidinfo;
	newlc = newcred->cr_loginclass;
	newprr = newcred->cr_prison->pr_prison_racct;

	LIST_INIT(&newrules);

again:
	/*
	 * First, count the rules that apply to the process with new
	 * credentials.
	 */
	rulecnt = 0;
	RACCT_LOCK();
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		if (link->rrl_rule->rr_subject_type ==
		    RCTL_SUBJECT_TYPE_PROCESS)
			rulecnt++;
	}
	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
		rulecnt++;
	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
		rulecnt++;
	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
		rulecnt++;
	RACCT_UNLOCK();

	/*
	 * Create temporary list.  We've dropped the rctl_lock in order
	 * to use M_WAITOK.
	 */
	for (i = 0; i < rulecnt; i++) {
		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
		newlink->rrl_rule = NULL;
		newlink->rrl_exceeded = 0;
		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
	}

	newlink = LIST_FIRST(&newrules);

	/*
	 * Assign rules to the newly allocated list entries.
	 */
	RACCT_LOCK();
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		if (link->rrl_rule->rr_subject_type ==
		    RCTL_SUBJECT_TYPE_PROCESS) {
			/*
			 * Running out of preallocated entries means there
			 * are now more rules than were counted above.
			 */
			if (newlink == NULL)
				goto goaround;
			rctl_rule_acquire(link->rrl_rule);
			newlink->rrl_rule = link->rrl_rule;
			newlink->rrl_exceeded = link->rrl_exceeded;
			newlink = LIST_NEXT(newlink, rrl_next);
			rulecnt--;
		}
	}

	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	/*
	 * rulecnt was decremented once per rule copied, so zero means
	 * this pass found exactly as many rules as were counted, and
	 * allocated for, in the first pass.  A non-zero value means
	 * fewer rules were found and some entries are still empty.
	 */
	if (rulecnt == 0) {
		/*
		 * Free the old rule list.
		 */
		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
			link = LIST_FIRST(&p->p_racct->r_rule_links);
			LIST_REMOVE(link, rrl_next);
			rctl_rule_release(link->rrl_rule);
			uma_zfree(rctl_rule_link_zone, link);
		}

		/*
		 * Replace lists and we're done.
		 *
		 * XXX: Is there any way to switch list heads instead
		 *      of iterating here?
		 */
		while (!LIST_EMPTY(&newrules)) {
			newlink = LIST_FIRST(&newrules);
			LIST_REMOVE(newlink, rrl_next);
			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
			    newlink, rrl_next);
		}

		RACCT_UNLOCK();

		return;
	}

goaround:
	RACCT_UNLOCK();

	/*
	 * Rule list changed while we were not holding the rctl_lock.
	 * Free the new list and try again.
	 */
	while (!LIST_EMPTY(&newrules)) {
		newlink = LIST_FIRST(&newrules);
		LIST_REMOVE(newlink, rrl_next);
		/* Entries that were filled in hold a rule reference. */
		if (newlink->rrl_rule != NULL)
			rctl_rule_release(newlink->rrl_rule);
		uma_zfree(rctl_rule_link_zone, newlink);
	}

	goto again;
}
2102 
2103 /*
2104  * Assign RCTL rules to the newly created process.
2105  */
2106 int
2107 rctl_proc_fork(struct proc *parent, struct proc *child)
2108 {
2109 	struct rctl_rule *rule;
2110 	struct rctl_rule_link *link;
2111 	int error;
2112 
2113 	ASSERT_RACCT_ENABLED();
2114 	RACCT_LOCK_ASSERT();
2115 	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2116 
2117 	LIST_INIT(&child->p_racct->r_rule_links);
2118 
2119 	/*
2120 	 * Go through limits applicable to the parent and assign them
2121 	 * to the child.  Rules with 'process' subject have to be duplicated
2122 	 * in order to make their rr_subject point to the new process.
2123 	 */
2124 	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2125 		if (link->rrl_rule->rr_subject_type ==
2126 		    RCTL_SUBJECT_TYPE_PROCESS) {
2127 			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2128 			if (rule == NULL)
2129 				goto fail;
2130 			KASSERT(rule->rr_subject.rs_proc == parent,
2131 			    ("rule->rr_subject.rs_proc != parent"));
2132 			rule->rr_subject.rs_proc = child;
2133 			error = rctl_racct_add_rule_locked(child->p_racct,
2134 			    rule);
2135 			rctl_rule_release(rule);
2136 			if (error != 0)
2137 				goto fail;
2138 		} else {
2139 			error = rctl_racct_add_rule_locked(child->p_racct,
2140 			    link->rrl_rule);
2141 			if (error != 0)
2142 				goto fail;
2143 		}
2144 	}
2145 
2146 	return (0);
2147 
2148 fail:
2149 	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2150 		link = LIST_FIRST(&child->p_racct->r_rule_links);
2151 		LIST_REMOVE(link, rrl_next);
2152 		rctl_rule_release(link->rrl_rule);
2153 		uma_zfree(rctl_rule_link_zone, link);
2154 	}
2155 
2156 	return (EAGAIN);
2157 }
2158 
2159 /*
2160  * Release rules attached to the racct.
2161  */
2162 void
2163 rctl_racct_release(struct racct *racct)
2164 {
2165 	struct rctl_rule_link *link;
2166 
2167 	ASSERT_RACCT_ENABLED();
2168 	RACCT_LOCK_ASSERT();
2169 
2170 	while (!LIST_EMPTY(&racct->r_rule_links)) {
2171 		link = LIST_FIRST(&racct->r_rule_links);
2172 		LIST_REMOVE(link, rrl_next);
2173 		rctl_rule_release(link->rrl_rule);
2174 		uma_zfree(rctl_rule_link_zone, link);
2175 	}
2176 }
2177 
2178 static void
2179 rctl_init(void)
2180 {
2181 
2182 	if (!racct_enable)
2183 		return;
2184 
2185 	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2186 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2187 	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2188 	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2189 	    UMA_ALIGN_PTR, 0);
2190 
2191 	/*
2192 	 * Set default values, making sure not to overwrite the ones
2193 	 * fetched from tunables.  Most of those could be set at the
2194 	 * declaration, except for the rctl_throttle_max - we cannot
2195 	 * set it there due to hz not being compile time constant.
2196 	 */
2197 	if (rctl_throttle_min < 1)
2198 		rctl_throttle_min = 1;
2199 	if (rctl_throttle_max < rctl_throttle_min)
2200 		rctl_throttle_max = 2 * hz;
2201 	if (rctl_throttle_pct < 0)
2202 		rctl_throttle_pct = 100;
2203 	if (rctl_throttle_pct2 < 0)
2204 		rctl_throttle_pct2 = 100;
2205 }
2206 
2207 #else /* !RCTL */
2208 
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

	return (ENOSYS);
}
2215 
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

	return (ENOSYS);
}
2222 
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

	return (ENOSYS);
}
2229 
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

	return (ENOSYS);
}
2236 
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

	return (ENOSYS);
}
2243 
2244 #endif /* !RCTL */
2245