xref: /freebsd/sys/kern/kern_rctl.c (revision 58a08f9e9910ea986e0f1103f47274a781b11874)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2010 The FreeBSD Foundation
5  *
6  * This software was developed by Edward Tomasz Napierala under sponsorship
7  * from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  * $FreeBSD$
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 #include <sys/param.h>
37 #include <sys/devctl.h>
38 #include <sys/malloc.h>
39 #include <sys/queue.h>
40 #include <sys/refcount.h>
41 #include <sys/jail.h>
42 #include <sys/kernel.h>
43 #include <sys/limits.h>
44 #include <sys/loginclass.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/racct.h>
48 #include <sys/rctl.h>
49 #include <sys/resourcevar.h>
50 #include <sys/sx.h>
51 #include <sys/sysent.h>
52 #include <sys/sysproto.h>
53 #include <sys/systm.h>
54 #include <sys/types.h>
55 #include <sys/eventhandler.h>
56 #include <sys/lock.h>
57 #include <sys/mutex.h>
58 #include <sys/rwlock.h>
59 #include <sys/sbuf.h>
60 #include <sys/taskqueue.h>
61 #include <sys/tree.h>
62 #include <vm/uma.h>
63 
64 #ifdef RCTL
65 #ifndef RACCT
66 #error "The RCTL option requires the RACCT option"
67 #endif
68 
69 FEATURE(rctl, "Resource Limits");
70 
/*
 * Flag values; none of them is referenced in this part of the file.
 * NOTE(review): judging by the names they describe whether a resource is
 * inherited / accumulated - confirm against their users.
 */
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer size limits, in bytes.  The expansions are parenthesized so that
 * the macros keep their value when used next to operators that bind as
 * tightly as '*', e.g. 'x / RCTL_MAX_INBUFSIZE' or 'x % RCTL_MAX_OUTBUFSIZE'.
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/*
 * Safety margin that rctl_pcpu_available() subtracts from the available
 * %cpu amount when the limit is larger than twice this value.
 */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
80 
/*
 * Knobs exported below under kern.racct.rctl: the maximum output buffer
 * size and the per-second rate limits used by rctl_enforce() for the
 * "log" and "devctl" actions.
 */
static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;

/*
 * Values below are initialized in rctl_init().
 */
static int rctl_throttle_min = -1;
static int rctl_throttle_max = -1;
static int rctl_throttle_pct = -1;
static int rctl_throttle_pct2 = -1;

/*
 * Handlers validating writes to the throttle_* sysctls declared below.
 */
static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);

/*
 * The kern.racct.rctl sysctl subtree.  Each throttle_* node is served by
 * its handler and is paired with a loader tunable of the same name that
 * writes straight into the backing variable.
 */
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
    &rctl_maxbufsize, 0, "Maximum output buffer size");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_min_sysctl, "IU",
    "Shortest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_max_sysctl, "IU",
    "Longest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_pct_sysctl, "IU",
    "Throttling penalty for process consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_pct2_sysctl, "IU",
    "Throttling penalty for container consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
126 
/*
 * 'rctl_rule_link' connects a rule with every racct it's related to.
 * For example, rule 'user:X:openfiles:deny=N/process' is linked
 * with uidinfo for user X, and to each process of that user.
 */
struct rctl_rule_link {
	/* Linkage on the owning racct's r_rule_links list. */
	LIST_ENTRY(rctl_rule_link)	rrl_next;
	/* The rule; a reference is taken when the link is created. */
	struct rctl_rule		*rrl_rule;
	/*
	 * Set to 1 by rctl_enforce() once the log, devctl or signal action
	 * has fired for this link; reset to 0 when usage fits the limit.
	 */
	int				rrl_exceeded;
};
137 
/*
 * One entry of a name <-> numeric value lookup table.  The tables in this
 * file end with an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* textual name */
	int		d_value;	/* matching numeric constant */
};
142 
/*
 * Names of the rule subject types; searched by rctl_subject_type_name().
 * The NULL entry terminates the table.
 */
static struct dict subjectnames[] = {
	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
	{ "user", RCTL_SUBJECT_TYPE_USER },
	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
	{ NULL, -1 }};
149 
/*
 * Names of the RACCT_* resources; searched by rctl_resource_name().
 * The NULL entry terminates the table.
 */
static struct dict resourcenames[] = {
	{ "cputime", RACCT_CPU },
	{ "datasize", RACCT_DATA },
	{ "stacksize", RACCT_STACK },
	{ "coredumpsize", RACCT_CORE },
	{ "memoryuse", RACCT_RSS },
	{ "memorylocked", RACCT_MEMLOCK },
	{ "maxproc", RACCT_NPROC },
	{ "openfiles", RACCT_NOFILE },
	{ "vmemoryuse", RACCT_VMEM },
	{ "pseudoterminals", RACCT_NPTS },
	{ "swapuse", RACCT_SWAP },
	{ "nthr", RACCT_NTHR },
	{ "msgqqueued", RACCT_MSGQQUEUED },
	{ "msgqsize", RACCT_MSGQSIZE },
	{ "nmsgq", RACCT_NMSGQ },
	{ "nsem", RACCT_NSEM },
	{ "nsemop", RACCT_NSEMOP },
	{ "nshm", RACCT_NSHM },
	{ "shmsize", RACCT_SHMSIZE },
	{ "wallclock", RACCT_WALLCLOCK },
	{ "pcpu", RACCT_PCTCPU },
	{ "readbps", RACCT_READBPS },
	{ "writebps", RACCT_WRITEBPS },
	{ "readiops", RACCT_READIOPS },
	{ "writeiops", RACCT_WRITEIOPS },
	{ NULL, -1 }};
177 
/*
 * Names of the rule actions; searched by rctl_action_name().  The "sig*"
 * entries are the signal-sending actions, followed by the four non-signal
 * ones.  The NULL entry terminates the table.
 */
static struct dict actionnames[] = {
	{ "sighup", RCTL_ACTION_SIGHUP },
	{ "sigint", RCTL_ACTION_SIGINT },
	{ "sigquit", RCTL_ACTION_SIGQUIT },
	{ "sigill", RCTL_ACTION_SIGILL },
	{ "sigtrap", RCTL_ACTION_SIGTRAP },
	{ "sigabrt", RCTL_ACTION_SIGABRT },
	{ "sigemt", RCTL_ACTION_SIGEMT },
	{ "sigfpe", RCTL_ACTION_SIGFPE },
	{ "sigkill", RCTL_ACTION_SIGKILL },
	{ "sigbus", RCTL_ACTION_SIGBUS },
	{ "sigsegv", RCTL_ACTION_SIGSEGV },
	{ "sigsys", RCTL_ACTION_SIGSYS },
	{ "sigpipe", RCTL_ACTION_SIGPIPE },
	{ "sigalrm", RCTL_ACTION_SIGALRM },
	{ "sigterm", RCTL_ACTION_SIGTERM },
	{ "sigurg", RCTL_ACTION_SIGURG },
	{ "sigstop", RCTL_ACTION_SIGSTOP },
	{ "sigtstp", RCTL_ACTION_SIGTSTP },
	{ "sigchld", RCTL_ACTION_SIGCHLD },
	{ "sigttin", RCTL_ACTION_SIGTTIN },
	{ "sigttou", RCTL_ACTION_SIGTTOU },
	{ "sigio", RCTL_ACTION_SIGIO },
	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
	{ "sigprof", RCTL_ACTION_SIGPROF },
	{ "sigwinch", RCTL_ACTION_SIGWINCH },
	{ "siginfo", RCTL_ACTION_SIGINFO },
	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
	{ "sigthr", RCTL_ACTION_SIGTHR },
	{ "deny", RCTL_ACTION_DENY },
	{ "log", RCTL_ACTION_LOG },
	{ "devctl", RCTL_ACTION_DEVCTL },
	{ "throttle", RCTL_ACTION_THROTTLE },
	{ NULL, -1 }};
215 
/* Subsystem initializer, registered to run first within SI_SUB_RACCT. */
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);

/* UMA zones backing 'struct rctl_rule' and 'struct rctl_rule_link'. */
static uma_zone_t rctl_rule_zone;
static uma_zone_t rctl_rule_link_zone;

/* Forward declarations for helpers defined later in the file. */
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);

/* Malloc type used for the temporary message buffers in rctl_enforce(). */
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
226 
227 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
228 {
229 	int error, val = rctl_throttle_min;
230 
231 	error = sysctl_handle_int(oidp, &val, 0, req);
232 	if (error || !req->newptr)
233 		return (error);
234 	if (val < 1 || val > rctl_throttle_max)
235 		return (EINVAL);
236 
237 	RACCT_LOCK();
238 	rctl_throttle_min = val;
239 	RACCT_UNLOCK();
240 
241 	return (0);
242 }
243 
244 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
245 {
246 	int error, val = rctl_throttle_max;
247 
248 	error = sysctl_handle_int(oidp, &val, 0, req);
249 	if (error || !req->newptr)
250 		return (error);
251 	if (val < rctl_throttle_min)
252 		return (EINVAL);
253 
254 	RACCT_LOCK();
255 	rctl_throttle_max = val;
256 	RACCT_UNLOCK();
257 
258 	return (0);
259 }
260 
261 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
262 {
263 	int error, val = rctl_throttle_pct;
264 
265 	error = sysctl_handle_int(oidp, &val, 0, req);
266 	if (error || !req->newptr)
267 		return (error);
268 	if (val < 0)
269 		return (EINVAL);
270 
271 	RACCT_LOCK();
272 	rctl_throttle_pct = val;
273 	RACCT_UNLOCK();
274 
275 	return (0);
276 }
277 
278 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
279 {
280 	int error, val = rctl_throttle_pct2;
281 
282 	error = sysctl_handle_int(oidp, &val, 0, req);
283 	if (error || !req->newptr)
284 		return (error);
285 	if (val < 0)
286 		return (EINVAL);
287 
288 	RACCT_LOCK();
289 	rctl_throttle_pct2 = val;
290 	RACCT_UNLOCK();
291 
292 	return (0);
293 }
294 
295 static const char *
296 rctl_subject_type_name(int subject)
297 {
298 	int i;
299 
300 	for (i = 0; subjectnames[i].d_name != NULL; i++) {
301 		if (subjectnames[i].d_value == subject)
302 			return (subjectnames[i].d_name);
303 	}
304 
305 	panic("rctl_subject_type_name: unknown subject type %d", subject);
306 }
307 
308 static const char *
309 rctl_action_name(int action)
310 {
311 	int i;
312 
313 	for (i = 0; actionnames[i].d_name != NULL; i++) {
314 		if (actionnames[i].d_value == action)
315 			return (actionnames[i].d_name);
316 	}
317 
318 	panic("rctl_action_name: unknown action %d", action);
319 }
320 
321 const char *
322 rctl_resource_name(int resource)
323 {
324 	int i;
325 
326 	for (i = 0; resourcenames[i].d_name != NULL; i++) {
327 		if (resourcenames[i].d_value == resource)
328 			return (resourcenames[i].d_name);
329 	}
330 
331 	panic("rctl_resource_name: unknown resource %d", resource);
332 }
333 
334 static struct racct *
335 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
336 {
337 	struct ucred *cred = p->p_ucred;
338 
339 	ASSERT_RACCT_ENABLED();
340 	RACCT_LOCK_ASSERT();
341 
342 	switch (rule->rr_per) {
343 	case RCTL_SUBJECT_TYPE_PROCESS:
344 		return (p->p_racct);
345 	case RCTL_SUBJECT_TYPE_USER:
346 		return (cred->cr_ruidinfo->ui_racct);
347 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
348 		return (cred->cr_loginclass->lc_racct);
349 	case RCTL_SUBJECT_TYPE_JAIL:
350 		return (cred->cr_prison->pr_prison_racct->prr_racct);
351 	default:
352 		panic("%s: unknown per %d", __func__, rule->rr_per);
353 	}
354 }
355 
356 /*
357  * Return the amount of resource that can be allocated by 'p' before
358  * hitting 'rule'.
359  */
360 static int64_t
361 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
362 {
363 	const struct racct *racct;
364 	int64_t available;
365 
366 	ASSERT_RACCT_ENABLED();
367 	RACCT_LOCK_ASSERT();
368 
369 	racct = rctl_proc_rule_to_racct(p, rule);
370 	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
371 
372 	return (available);
373 }
374 
375 /*
376  * Called every second for proc, uidinfo, loginclass, and jail containers.
377  * If the limit isn't exceeded, it decreases the usage amount to zero.
378  * Otherwise, it decreases it by the value of the limit.  This way
379  * resource consumption exceeding the limit "carries over" to the next
380  * period.
381  */
382 void
383 rctl_throttle_decay(struct racct *racct, int resource)
384 {
385 	struct rctl_rule *rule;
386 	struct rctl_rule_link *link;
387 	int64_t minavailable;
388 
389 	ASSERT_RACCT_ENABLED();
390 	RACCT_LOCK_ASSERT();
391 
392 	minavailable = INT64_MAX;
393 
394 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
395 		rule = link->rrl_rule;
396 
397 		if (rule->rr_resource != resource)
398 			continue;
399 		if (rule->rr_action != RCTL_ACTION_THROTTLE)
400 			continue;
401 
402 		if (rule->rr_amount < minavailable)
403 			minavailable = rule->rr_amount;
404 	}
405 
406 	if (racct->r_resources[resource] < minavailable) {
407 		racct->r_resources[resource] = 0;
408 	} else {
409 		/*
410 		 * Cap utilization counter at ten times the limit.  Otherwise,
411 		 * if we changed the rule lowering the allowed amount, it could
412 		 * take unreasonably long time for the accumulated resource
413 		 * usage to drop.
414 		 */
415 		if (racct->r_resources[resource] > minavailable * 10)
416 			racct->r_resources[resource] = minavailable * 10;
417 
418 		racct->r_resources[resource] -= minavailable;
419 	}
420 }
421 
422 /*
423  * Special version of rctl_get_available() for the %CPU resource.
424  * We slightly cheat here and return less than we normally would.
425  */
426 int64_t
427 rctl_pcpu_available(const struct proc *p) {
428 	struct rctl_rule *rule;
429 	struct rctl_rule_link *link;
430 	int64_t available, minavailable, limit;
431 
432 	ASSERT_RACCT_ENABLED();
433 	RACCT_LOCK_ASSERT();
434 
435 	minavailable = INT64_MAX;
436 	limit = 0;
437 
438 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
439 		rule = link->rrl_rule;
440 		if (rule->rr_resource != RACCT_PCTCPU)
441 			continue;
442 		if (rule->rr_action != RCTL_ACTION_DENY)
443 			continue;
444 		available = rctl_available_resource(p, rule);
445 		if (available < minavailable) {
446 			minavailable = available;
447 			limit = rule->rr_amount;
448 		}
449 	}
450 
451 	/*
452 	 * Return slightly less than actual value of the available
453 	 * %cpu resource.  This makes %cpu throttling more aggressive
454 	 * and lets us act sooner than the limits are already exceeded.
455 	 */
456 	if (limit != 0) {
457 		if (limit > 2 * RCTL_PCPU_SHIFT)
458 			minavailable -= RCTL_PCPU_SHIFT;
459 		else
460 			minavailable -= (limit / 2);
461 	}
462 
463 	return (minavailable);
464 }
465 
/*
 * Saturating unsigned addition: returns a + b, or UINT64_MAX when the sum
 * does not fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{
	uint64_t sum = a + b;

	/* A wrapped sum is smaller than at least one of the operands. */
	return ((sum >= a && sum >= b) ? sum : UINT64_MAX);
}
481 
/*
 * Saturating unsigned multiplication: returns a * b, or UINT64_MAX when the
 * product does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	/* The product fits iff b is zero or a is at most UINT64_MAX / b. */
	if (b == 0 || a <= UINT64_MAX / b)
		return (a * b);

	return (UINT64_MAX);
}
491 
/*
 * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
 * to what it keeps allocated now.  Returns non-zero if the allocation should
 * be denied, 0 otherwise.
 *
 * Every rule linked to the process' racct that concerns 'resource' and
 * would be exceeded by the allocation has its action carried out: log,
 * devctl notification, throttling or signal delivery happen right here,
 * while "deny" only makes the function return EDOOFUS at the end.
 * Must be called with the racct lock held.
 */
int
rctl_enforce(struct proc *p, int resource, uint64_t amount)
{
	/*
	 * Rate limiting state for ppsratecheck(); being static, it is
	 * shared by all callers of this function.
	 */
	static struct timeval log_lasttime, devctl_lasttime;
	static int log_curtime = 0, devctl_curtime = 0;
	struct rctl_rule *rule;
	struct rctl_rule_link *link;
	struct sbuf sb;
	char *buf;
	int64_t available;
	uint64_t sleep_ms, sleep_ratio;
	int should_deny = 0;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	/*
	 * There may be more than one matching rule; go through all of them.
	 * Denial should be done last, after logging and sending signals.
	 */
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		rule = link->rrl_rule;
		if (rule->rr_resource != resource)
			continue;

		/*
		 * The request fits under this rule: re-arm the one-shot
		 * actions by clearing the "already fired" marker.
		 */
		available = rctl_available_resource(p, rule);
		if (available >= (int64_t)amount) {
			link->rrl_exceeded = 0;
			continue;
		}

		switch (rule->rr_action) {
		case RCTL_ACTION_DENY:
			should_deny = 1;
			continue;
		case RCTL_ACTION_LOG:
			/*
			 * If rrl_exceeded != 0, it means we've already
			 * logged a warning for this process.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			/*
			 * If the process state is not fully initialized yet,
			 * we can't access most of the required fields, e.g.
			 * p->p_comm.  This happens when called from fork1().
			 * Ignore this rule for now; it will be processed just
			 * after fork, when called from racct_proc_fork_done().
			 */
			if (p->p_state != PRS_NORMAL)
				continue;

			/*
			 * Over the message rate limit; rrl_exceeded stays
			 * clear, so a later call can still log.
			 */
			if (!ppsratecheck(&log_lasttime, &log_curtime,
			    rctl_log_rate_limit))
				continue;

			/* Non-sleeping allocation; may fail. */
			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
			if (buf == NULL) {
				printf("rctl_enforce: out of memory\n");
				continue;
			}
			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
			rctl_rule_to_sbuf(&sb, rule);
			sbuf_finish(&sb);
			printf("rctl: rule \"%s\" matched by pid %d "
			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
			sbuf_delete(&sb);
			free(buf, M_RCTL);
			link->rrl_exceeded = 1;
			continue;
		case RCTL_ACTION_DEVCTL:
			/*
			 * Same gating as for the log action above: fire
			 * once, only for fully set up processes, and within
			 * the devctl rate limit.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			if (p->p_state != PRS_NORMAL)
				continue;

			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
			    rctl_devctl_rate_limit))
				continue;

			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
			if (buf == NULL) {
				printf("rctl_enforce: out of memory\n");
				continue;
			}
			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
			sbuf_printf(&sb, "rule=");
			rctl_rule_to_sbuf(&sb, rule);
			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
			    p->p_pid, p->p_ucred->cr_ruid,
			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
			sbuf_finish(&sb);
			devctl_notify("RCTL", "rule", "matched",
			    sbuf_data(&sb));
			sbuf_delete(&sb);
			free(buf, M_RCTL);
			link->rrl_exceeded = 1;
			continue;
		case RCTL_ACTION_THROTTLE:
			if (p->p_state != PRS_NORMAL)
				continue;

			/*
			 * A zero limit would make the divisions below
			 * divide by zero; use the longest duration instead.
			 */
			if (rule->rr_amount == 0) {
				racct_proc_throttle(p, rctl_throttle_max);
				continue;
			}

			/*
			 * Make the process sleep for a fraction of second
			 * proportional to the ratio of process' resource
			 * utilization compared to the limit.  The point is
			 * to penalize resource hogs: processes that consume
			 * more of the available resources sleep for longer.
			 *
			 * We're trying to defer division until the very end,
			 * to minimize the rounding effects.  The following
			 * calculation could have been written in a clearer
			 * way like this:
			 *
			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
			 *     rule->rr_amount;
			 * sleep_ms *= rctl_throttle_pct / 100;
			 * if (sleep_ms < rctl_throttle_min)
			 *         sleep_ms = rctl_throttle_min;
			 *
			 * NOTE(review): despite its name, sleep_ms is scaled
			 * by hz and clamped by throttle_min/throttle_max,
			 * which the sysctl descriptions give "in hz", so the
			 * unit is presumably ticks - confirm.
			 */
			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
				sleep_ms = rctl_throttle_min * rule->rr_amount;

			/*
			 * Multiply that by the ratio of the resource
			 * consumption for the container compared to the limit,
			 * squared.  In other words, a process in a container
			 * that is two times over the limit will be throttled
			 * four times as much for hitting the same rule.  The
			 * point is to penalize processes more if the container
			 * itself (eg certain UID or jail) is above the limit.
			 */
			if (available < 0)
				sleep_ratio = -available / rule->rr_amount;
			else
				sleep_ratio = 0;
			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));

			/*
			 * Finally the division.
			 */
			sleep_ms /= rule->rr_amount;

			if (sleep_ms > rctl_throttle_max)
				sleep_ms = rctl_throttle_max;
#if 0
			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
			   __func__, p->p_pid, p->p_comm,
			   p->p_racct->r_resources[resource],
			   rule->rr_amount, (uintmax_t)sleep_ms,
			   (uintmax_t)sleep_ratio, (intmax_t)available);
#endif

			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
			racct_proc_throttle(p, sleep_ms);
			continue;
		default:
			/*
			 * Anything else is a signal action; deliver the
			 * signal once per excursion over the limit.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			if (p->p_state != PRS_NORMAL)
				continue;

			KASSERT(rule->rr_action > 0 &&
			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
			    ("rctl_enforce: unknown action %d",
			     rule->rr_action));

			/*
			 * We're using the fact that RCTL_ACTION_SIG* values
			 * are equal to their counterparts from sys/signal.h.
			 */
			kern_psignal(p, rule->rr_action);
			link->rrl_exceeded = 1;
			continue;
		}
	}

	if (should_deny) {
		/*
		 * Return fake error code; the caller should change it
		 * into one proper for the situation - EFSIZ, ENOMEM etc.
		 */
		return (EDOOFUS);
	}

	return (0);
}
700 
701 uint64_t
702 rctl_get_limit(struct proc *p, int resource)
703 {
704 	struct rctl_rule *rule;
705 	struct rctl_rule_link *link;
706 	uint64_t amount = UINT64_MAX;
707 
708 	ASSERT_RACCT_ENABLED();
709 	RACCT_LOCK_ASSERT();
710 
711 	/*
712 	 * There may be more than one matching rule; go through all of them.
713 	 * Denial should be done last, after logging and sending signals.
714 	 */
715 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
716 		rule = link->rrl_rule;
717 		if (rule->rr_resource != resource)
718 			continue;
719 		if (rule->rr_action != RCTL_ACTION_DENY)
720 			continue;
721 		if (rule->rr_amount < amount)
722 			amount = rule->rr_amount;
723 	}
724 
725 	return (amount);
726 }
727 
728 uint64_t
729 rctl_get_available(struct proc *p, int resource)
730 {
731 	struct rctl_rule *rule;
732 	struct rctl_rule_link *link;
733 	int64_t available, minavailable, allocated;
734 
735 	minavailable = INT64_MAX;
736 
737 	ASSERT_RACCT_ENABLED();
738 	RACCT_LOCK_ASSERT();
739 
740 	/*
741 	 * There may be more than one matching rule; go through all of them.
742 	 * Denial should be done last, after logging and sending signals.
743 	 */
744 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
745 		rule = link->rrl_rule;
746 		if (rule->rr_resource != resource)
747 			continue;
748 		if (rule->rr_action != RCTL_ACTION_DENY)
749 			continue;
750 		available = rctl_available_resource(p, rule);
751 		if (available < minavailable)
752 			minavailable = available;
753 	}
754 
755 	/*
756 	 * XXX: Think about this _hard_.
757 	 */
758 	allocated = p->p_racct->r_resources[resource];
759 	if (minavailable < INT64_MAX - allocated)
760 		minavailable += allocated;
761 	if (minavailable < 0)
762 		minavailable = 0;
763 
764 	return (minavailable);
765 }
766 
767 static int
768 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
769 {
770 
771 	ASSERT_RACCT_ENABLED();
772 
773 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
774 		if (rule->rr_subject_type != filter->rr_subject_type)
775 			return (0);
776 
777 		switch (filter->rr_subject_type) {
778 		case RCTL_SUBJECT_TYPE_PROCESS:
779 			if (filter->rr_subject.rs_proc != NULL &&
780 			    rule->rr_subject.rs_proc !=
781 			    filter->rr_subject.rs_proc)
782 				return (0);
783 			break;
784 		case RCTL_SUBJECT_TYPE_USER:
785 			if (filter->rr_subject.rs_uip != NULL &&
786 			    rule->rr_subject.rs_uip !=
787 			    filter->rr_subject.rs_uip)
788 				return (0);
789 			break;
790 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
791 			if (filter->rr_subject.rs_loginclass != NULL &&
792 			    rule->rr_subject.rs_loginclass !=
793 			    filter->rr_subject.rs_loginclass)
794 				return (0);
795 			break;
796 		case RCTL_SUBJECT_TYPE_JAIL:
797 			if (filter->rr_subject.rs_prison_racct != NULL &&
798 			    rule->rr_subject.rs_prison_racct !=
799 			    filter->rr_subject.rs_prison_racct)
800 				return (0);
801 			break;
802 		default:
803 			panic("rctl_rule_matches: unknown subject type %d",
804 			    filter->rr_subject_type);
805 		}
806 	}
807 
808 	if (filter->rr_resource != RACCT_UNDEFINED) {
809 		if (rule->rr_resource != filter->rr_resource)
810 			return (0);
811 	}
812 
813 	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
814 		if (rule->rr_action != filter->rr_action)
815 			return (0);
816 	}
817 
818 	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
819 		if (rule->rr_amount != filter->rr_amount)
820 			return (0);
821 	}
822 
823 	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
824 		if (rule->rr_per != filter->rr_per)
825 			return (0);
826 	}
827 
828 	return (1);
829 }
830 
831 static int
832 str2value(const char *str, int *value, struct dict *table)
833 {
834 	int i;
835 
836 	if (value == NULL)
837 		return (EINVAL);
838 
839 	for (i = 0; table[i].d_name != NULL; i++) {
840 		if (strcasecmp(table[i].d_name, str) == 0) {
841 			*value =  table[i].d_value;
842 			return (0);
843 		}
844 	}
845 
846 	return (EINVAL);
847 }
848 
849 static int
850 str2id(const char *str, id_t *value)
851 {
852 	char *end;
853 
854 	if (str == NULL)
855 		return (EINVAL);
856 
857 	*value = strtoul(str, &end, 10);
858 	if ((size_t)(end - str) != strlen(str))
859 		return (EINVAL);
860 
861 	return (0);
862 }
863 
/*
 * Parse 'str' as a decimal number into '*value'.  Returns 0 on success,
 * EINVAL when 'str' is NULL or has trailing characters that are not part of
 * the number, and ERANGE when the stored result is negative.
 *
 * NOTE(review): the conversion goes through strtoul(), i.e. unsigned long;
 * where that type is narrower than 64 bits larger amounts cannot be parsed.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *stop;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &stop, 10);

	/* The parse consumed everything iff it stopped at the terminator. */
	if (*stop != '\0')
		return (EINVAL);

	return (*value < 0 ? ERANGE : 0);
}
881 
882 /*
883  * Connect the rule to the racct, increasing refcount for the rule.
884  */
885 static void
886 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
887 {
888 	struct rctl_rule_link *link;
889 
890 	ASSERT_RACCT_ENABLED();
891 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
892 
893 	rctl_rule_acquire(rule);
894 	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
895 	link->rrl_rule = rule;
896 	link->rrl_exceeded = 0;
897 
898 	RACCT_LOCK();
899 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
900 	RACCT_UNLOCK();
901 }
902 
903 static int
904 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
905 {
906 	struct rctl_rule_link *link;
907 
908 	ASSERT_RACCT_ENABLED();
909 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
910 	RACCT_LOCK_ASSERT();
911 
912 	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
913 	if (link == NULL)
914 		return (ENOMEM);
915 	rctl_rule_acquire(rule);
916 	link->rrl_rule = rule;
917 	link->rrl_exceeded = 0;
918 
919 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
920 
921 	return (0);
922 }
923 
924 /*
925  * Remove limits for a rules matching the filter and release
926  * the refcounts for the rules, possibly freeing them.  Returns
927  * the number of limit structures removed.
928  */
929 static int
930 rctl_racct_remove_rules(struct racct *racct,
931     const struct rctl_rule *filter)
932 {
933 	struct rctl_rule_link *link, *linktmp;
934 	int removed = 0;
935 
936 	ASSERT_RACCT_ENABLED();
937 	RACCT_LOCK_ASSERT();
938 
939 	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
940 		if (!rctl_rule_matches(link->rrl_rule, filter))
941 			continue;
942 
943 		LIST_REMOVE(link, rrl_next);
944 		rctl_rule_release(link->rrl_rule);
945 		uma_zfree(rctl_rule_link_zone, link);
946 		removed++;
947 	}
948 	return (removed);
949 }
950 
951 static void
952 rctl_rule_acquire_subject(struct rctl_rule *rule)
953 {
954 
955 	ASSERT_RACCT_ENABLED();
956 
957 	switch (rule->rr_subject_type) {
958 	case RCTL_SUBJECT_TYPE_UNDEFINED:
959 	case RCTL_SUBJECT_TYPE_PROCESS:
960 		break;
961 	case RCTL_SUBJECT_TYPE_JAIL:
962 		if (rule->rr_subject.rs_prison_racct != NULL)
963 			prison_racct_hold(rule->rr_subject.rs_prison_racct);
964 		break;
965 	case RCTL_SUBJECT_TYPE_USER:
966 		if (rule->rr_subject.rs_uip != NULL)
967 			uihold(rule->rr_subject.rs_uip);
968 		break;
969 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
970 		if (rule->rr_subject.rs_loginclass != NULL)
971 			loginclass_hold(rule->rr_subject.rs_loginclass);
972 		break;
973 	default:
974 		panic("rctl_rule_acquire_subject: unknown subject type %d",
975 		    rule->rr_subject_type);
976 	}
977 }
978 
979 static void
980 rctl_rule_release_subject(struct rctl_rule *rule)
981 {
982 
983 	ASSERT_RACCT_ENABLED();
984 
985 	switch (rule->rr_subject_type) {
986 	case RCTL_SUBJECT_TYPE_UNDEFINED:
987 	case RCTL_SUBJECT_TYPE_PROCESS:
988 		break;
989 	case RCTL_SUBJECT_TYPE_JAIL:
990 		if (rule->rr_subject.rs_prison_racct != NULL)
991 			prison_racct_free(rule->rr_subject.rs_prison_racct);
992 		break;
993 	case RCTL_SUBJECT_TYPE_USER:
994 		if (rule->rr_subject.rs_uip != NULL)
995 			uifree(rule->rr_subject.rs_uip);
996 		break;
997 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
998 		if (rule->rr_subject.rs_loginclass != NULL)
999 			loginclass_free(rule->rr_subject.rs_loginclass);
1000 		break;
1001 	default:
1002 		panic("rctl_rule_release_subject: unknown subject type %d",
1003 		    rule->rr_subject_type);
1004 	}
1005 }
1006 
1007 struct rctl_rule *
1008 rctl_rule_alloc(int flags)
1009 {
1010 	struct rctl_rule *rule;
1011 
1012 	ASSERT_RACCT_ENABLED();
1013 
1014 	rule = uma_zalloc(rctl_rule_zone, flags);
1015 	if (rule == NULL)
1016 		return (NULL);
1017 	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1018 	rule->rr_subject.rs_proc = NULL;
1019 	rule->rr_subject.rs_uip = NULL;
1020 	rule->rr_subject.rs_loginclass = NULL;
1021 	rule->rr_subject.rs_prison_racct = NULL;
1022 	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1023 	rule->rr_resource = RACCT_UNDEFINED;
1024 	rule->rr_action = RCTL_ACTION_UNDEFINED;
1025 	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1026 	refcount_init(&rule->rr_refcount, 1);
1027 
1028 	return (rule);
1029 }
1030 
1031 struct rctl_rule *
1032 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1033 {
1034 	struct rctl_rule *copy;
1035 
1036 	ASSERT_RACCT_ENABLED();
1037 
1038 	copy = uma_zalloc(rctl_rule_zone, flags);
1039 	if (copy == NULL)
1040 		return (NULL);
1041 	copy->rr_subject_type = rule->rr_subject_type;
1042 	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1043 	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1044 	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1045 	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1046 	copy->rr_per = rule->rr_per;
1047 	copy->rr_resource = rule->rr_resource;
1048 	copy->rr_action = rule->rr_action;
1049 	copy->rr_amount = rule->rr_amount;
1050 	refcount_init(&copy->rr_refcount, 1);
1051 	rctl_rule_acquire_subject(copy);
1052 
1053 	return (copy);
1054 }
1055 
/*
 * Take an additional reference on 'rule'.  The rule must still be
 * referenced (rr_refcount > 0) when this is called; that is asserted.
 */
void
rctl_rule_acquire(struct rctl_rule *rule)
{

	ASSERT_RACCT_ENABLED();
	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));

	refcount_acquire(&rule->rr_refcount);
}
1065 
1066 static void
1067 rctl_rule_free(void *context, int pending)
1068 {
1069 	struct rctl_rule *rule;
1070 
1071 	rule = (struct rctl_rule *)context;
1072 
1073 	ASSERT_RACCT_ENABLED();
1074 	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1075 
1076 	/*
1077 	 * We don't need locking here; rule is guaranteed to be inaccessible.
1078 	 */
1079 
1080 	rctl_rule_release_subject(rule);
1081 	uma_zfree(rctl_rule_zone, rule);
1082 }
1083 
1084 void
1085 rctl_rule_release(struct rctl_rule *rule)
1086 {
1087 
1088 	ASSERT_RACCT_ENABLED();
1089 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1090 
1091 	if (refcount_release(&rule->rr_refcount)) {
1092 		/*
1093 		 * rctl_rule_release() is often called when iterating
1094 		 * over all the uidinfo structures in the system,
1095 		 * holding uihashtbl_lock.  Since rctl_rule_free()
1096 		 * might end up calling uifree(), this would lead
1097 		 * to lock recursion.  Use taskqueue to avoid this.
1098 		 */
1099 		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1100 		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1101 	}
1102 }
1103 
1104 static int
1105 rctl_rule_fully_specified(const struct rctl_rule *rule)
1106 {
1107 
1108 	ASSERT_RACCT_ENABLED();
1109 
1110 	switch (rule->rr_subject_type) {
1111 	case RCTL_SUBJECT_TYPE_UNDEFINED:
1112 		return (0);
1113 	case RCTL_SUBJECT_TYPE_PROCESS:
1114 		if (rule->rr_subject.rs_proc == NULL)
1115 			return (0);
1116 		break;
1117 	case RCTL_SUBJECT_TYPE_USER:
1118 		if (rule->rr_subject.rs_uip == NULL)
1119 			return (0);
1120 		break;
1121 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1122 		if (rule->rr_subject.rs_loginclass == NULL)
1123 			return (0);
1124 		break;
1125 	case RCTL_SUBJECT_TYPE_JAIL:
1126 		if (rule->rr_subject.rs_prison_racct == NULL)
1127 			return (0);
1128 		break;
1129 	default:
1130 		panic("rctl_rule_fully_specified: unknown subject type %d",
1131 		    rule->rr_subject_type);
1132 	}
1133 	if (rule->rr_resource == RACCT_UNDEFINED)
1134 		return (0);
1135 	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1136 		return (0);
1137 	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1138 		return (0);
1139 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1140 		return (0);
1141 
1142 	return (1);
1143 }
1144 
/*
 * Parse a textual rule (or filter) of the form
 * "subject:subject-id:resource:action=amount/per" into a newly
 * allocated rule.
 *
 * 'rulestr' is split in place with strsep(), so the caller's buffer is
 * modified.  Any field that is missing or empty is left at its
 * "undefined" value, which is how partially specified filters are
 * expressed.
 *
 * On success returns 0 and stores the rule, holding one reference, in
 * '*rulep'.  On failure returns an errno value, releases the rule and
 * leaves '*rulep' untouched.
 *
 * When the subject is a process, allproc_lock must be held by the
 * caller (asserted below).
 */
static int
rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
{
	struct rctl_rule *rule;
	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
	     *amountstr, *perstr;
	id_t id;
	int error = 0;

	ASSERT_RACCT_ENABLED();

	rule = rctl_rule_alloc(M_WAITOK);

	/*
	 * Split the string into its fields.  The action is terminated by
	 * either '=' or '/', the amount by '/', and whatever remains is
	 * the 'per' part.
	 */
	subjectstr = strsep(&rulestr, ":");
	subject_idstr = strsep(&rulestr, ":");
	resourcestr = strsep(&rulestr, ":");
	actionstr = strsep(&rulestr, "=/");
	amountstr = strsep(&rulestr, "/");
	perstr = rulestr;

	if (subjectstr == NULL || subjectstr[0] == '\0')
		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
	else {
		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
		if (error != 0)
			goto out;
	}

	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
		rule->rr_subject.rs_proc = NULL;
		rule->rr_subject.rs_uip = NULL;
		rule->rr_subject.rs_loginclass = NULL;
		rule->rr_subject.rs_prison_racct = NULL;
	} else {
		/* A subject id only makes sense together with a subject type. */
		switch (rule->rr_subject_type) {
		case RCTL_SUBJECT_TYPE_UNDEFINED:
			error = EINVAL;
			goto out;
		case RCTL_SUBJECT_TYPE_PROCESS:
			error = str2id(subject_idstr, &id);
			if (error != 0)
				goto out;
			sx_assert(&allproc_lock, SA_LOCKED);
			rule->rr_subject.rs_proc = pfind(id);
			if (rule->rr_subject.rs_proc == NULL) {
				error = ESRCH;
				goto out;
			}
			/* The process lock is dropped right after the lookup. */
			PROC_UNLOCK(rule->rr_subject.rs_proc);
			break;
		case RCTL_SUBJECT_TYPE_USER:
			error = str2id(subject_idstr, &id);
			if (error != 0)
				goto out;
			rule->rr_subject.rs_uip = uifind(id);
			break;
		case RCTL_SUBJECT_TYPE_LOGINCLASS:
			rule->rr_subject.rs_loginclass =
			    loginclass_find(subject_idstr);
			if (rule->rr_subject.rs_loginclass == NULL) {
				error = ENAMETOOLONG;
				goto out;
			}
			break;
		case RCTL_SUBJECT_TYPE_JAIL:
			rule->rr_subject.rs_prison_racct =
			    prison_racct_find(subject_idstr);
			if (rule->rr_subject.rs_prison_racct == NULL) {
				error = ENAMETOOLONG;
				goto out;
			}
			break;
               default:
                       panic("rctl_string_to_rule: unknown subject type %d",
                           rule->rr_subject_type);
               }
	}

	if (resourcestr == NULL || resourcestr[0] == '\0')
		rule->rr_resource = RACCT_UNDEFINED;
	else {
		error = str2value(resourcestr, &rule->rr_resource,
		    resourcenames);
		if (error != 0)
			goto out;
	}

	if (actionstr == NULL || actionstr[0] == '\0')
		rule->rr_action = RCTL_ACTION_UNDEFINED;
	else {
		error = str2value(actionstr, &rule->rr_action, actionnames);
		if (error != 0)
			goto out;
	}

	if (amountstr == NULL || amountstr[0] == '\0')
		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
	else {
		error = str2int64(amountstr, &rule->rr_amount);
		if (error != 0)
			goto out;
		/*
		 * Resources kept "in millions" are scaled up here; refuse
		 * amounts that would overflow when multiplied.
		 */
		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
			if (rule->rr_amount > INT64_MAX / 1000000) {
				error = ERANGE;
				goto out;
			}
			rule->rr_amount *= 1000000;
		}
	}

	if (perstr == NULL || perstr[0] == '\0')
		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
	else {
		error = str2value(perstr, &rule->rr_per, subjectnames);
		if (error != 0)
			goto out;
	}

out:
	/* Hand the rule to the caller on success; drop it otherwise. */
	if (error == 0)
		*rulep = rule;
	else
		rctl_rule_release(rule);

	return (error);
}
1271 
1272 /*
1273  * Link a rule with all the subjects it applies to.
1274  */
1275 int
1276 rctl_rule_add(struct rctl_rule *rule)
1277 {
1278 	struct proc *p;
1279 	struct ucred *cred;
1280 	struct uidinfo *uip;
1281 	struct prison *pr;
1282 	struct prison_racct *prr;
1283 	struct loginclass *lc;
1284 	struct rctl_rule *rule2;
1285 	int match;
1286 
1287 	ASSERT_RACCT_ENABLED();
1288 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1289 
1290 	/*
1291 	 * Some rules just don't make sense, like "deny" rule for an undeniable
1292 	 * resource.  The exception are the RSS and %CPU resources - they are
1293 	 * not deniable in the racct sense, but the limit is enforced in
1294 	 * a different way.
1295 	 */
1296 	if (rule->rr_action == RCTL_ACTION_DENY &&
1297 	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
1298 	    rule->rr_resource != RACCT_RSS &&
1299 	    rule->rr_resource != RACCT_PCTCPU) {
1300 		return (EOPNOTSUPP);
1301 	}
1302 
1303 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1304 	    !RACCT_IS_DECAYING(rule->rr_resource)) {
1305 		return (EOPNOTSUPP);
1306 	}
1307 
1308 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1309 	    rule->rr_resource == RACCT_PCTCPU) {
1310 		return (EOPNOTSUPP);
1311 	}
1312 
1313 	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1314 	    RACCT_IS_SLOPPY(rule->rr_resource)) {
1315 		return (EOPNOTSUPP);
1316 	}
1317 
1318 	/*
1319 	 * Make sure there are no duplicated rules.  Also, for the "deny"
1320 	 * rules, remove ones differing only by "amount".
1321 	 */
1322 	if (rule->rr_action == RCTL_ACTION_DENY) {
1323 		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1324 		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1325 		rctl_rule_remove(rule2);
1326 		rctl_rule_release(rule2);
1327 	} else
1328 		rctl_rule_remove(rule);
1329 
1330 	switch (rule->rr_subject_type) {
1331 	case RCTL_SUBJECT_TYPE_PROCESS:
1332 		p = rule->rr_subject.rs_proc;
1333 		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1334 
1335 		rctl_racct_add_rule(p->p_racct, rule);
1336 		/*
1337 		 * In case of per-process rule, we don't have anything more
1338 		 * to do.
1339 		 */
1340 		return (0);
1341 
1342 	case RCTL_SUBJECT_TYPE_USER:
1343 		uip = rule->rr_subject.rs_uip;
1344 		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1345 		rctl_racct_add_rule(uip->ui_racct, rule);
1346 		break;
1347 
1348 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1349 		lc = rule->rr_subject.rs_loginclass;
1350 		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1351 		rctl_racct_add_rule(lc->lc_racct, rule);
1352 		break;
1353 
1354 	case RCTL_SUBJECT_TYPE_JAIL:
1355 		prr = rule->rr_subject.rs_prison_racct;
1356 		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1357 		rctl_racct_add_rule(prr->prr_racct, rule);
1358 		break;
1359 
1360 	default:
1361 		panic("rctl_rule_add: unknown subject type %d",
1362 		    rule->rr_subject_type);
1363 	}
1364 
1365 	/*
1366 	 * Now go through all the processes and add the new rule to the ones
1367 	 * it applies to.
1368 	 */
1369 	sx_assert(&allproc_lock, SA_LOCKED);
1370 	FOREACH_PROC_IN_SYSTEM(p) {
1371 		cred = p->p_ucred;
1372 		switch (rule->rr_subject_type) {
1373 		case RCTL_SUBJECT_TYPE_USER:
1374 			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1375 			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1376 				break;
1377 			continue;
1378 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1379 			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1380 				break;
1381 			continue;
1382 		case RCTL_SUBJECT_TYPE_JAIL:
1383 			match = 0;
1384 			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1385 				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1386 					match = 1;
1387 					break;
1388 				}
1389 			}
1390 			if (match)
1391 				break;
1392 			continue;
1393 		default:
1394 			panic("rctl_rule_add: unknown subject type %d",
1395 			    rule->rr_subject_type);
1396 		}
1397 
1398 		rctl_racct_add_rule(p->p_racct, rule);
1399 	}
1400 
1401 	return (0);
1402 }
1403 
/*
 * Acquires the racct lock.  Passed as the "pre" hook to the
 * *_racct_foreach() iterators in this file; presumably they invoke it
 * before walking their racct structures -- confirm against the iterators.
 */
static void
rctl_rule_pre_callback(void)
{

	RACCT_LOCK();
}
1410 
/*
 * Releases the racct lock.  Passed as the "post" hook to the
 * *_racct_foreach() iterators in this file, as the counterpart of
 * rctl_rule_pre_callback().
 */
static void
rctl_rule_post_callback(void)
{

	RACCT_UNLOCK();
}
1417 
/*
 * Per-racct callback used by rctl_rule_remove().  'arg2' is the filter
 * rule and 'arg3' points to an int to which the result of
 * rctl_racct_remove_rules() for 'racct' is added.  Runs with the racct
 * lock held (asserted).
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *totalp = arg3;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	*totalp += rctl_racct_remove_rules(racct, filter);
}
1431 
1432 /*
1433  * Remove all rules that match the filter.
1434  */
1435 int
1436 rctl_rule_remove(struct rctl_rule *filter)
1437 {
1438 	struct proc *p;
1439 	int found = 0;
1440 
1441 	ASSERT_RACCT_ENABLED();
1442 
1443 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1444 	    filter->rr_subject.rs_proc != NULL) {
1445 		p = filter->rr_subject.rs_proc;
1446 		RACCT_LOCK();
1447 		found = rctl_racct_remove_rules(p->p_racct, filter);
1448 		RACCT_UNLOCK();
1449 		if (found)
1450 			return (0);
1451 		return (ESRCH);
1452 	}
1453 
1454 	loginclass_racct_foreach(rctl_rule_remove_callback,
1455 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1456 	    filter, (void *)&found);
1457 	ui_racct_foreach(rctl_rule_remove_callback,
1458 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1459 	    filter, (void *)&found);
1460 	prison_racct_foreach(rctl_rule_remove_callback,
1461 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1462 	    filter, (void *)&found);
1463 
1464 	sx_assert(&allproc_lock, SA_LOCKED);
1465 	RACCT_LOCK();
1466 	FOREACH_PROC_IN_SYSTEM(p) {
1467 		found += rctl_racct_remove_rules(p->p_racct, filter);
1468 	}
1469 	RACCT_UNLOCK();
1470 
1471 	if (found)
1472 		return (0);
1473 	return (ESRCH);
1474 }
1475 
1476 /*
1477  * Appends a rule to the sbuf.
1478  */
1479 static void
1480 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1481 {
1482 	int64_t amount;
1483 
1484 	ASSERT_RACCT_ENABLED();
1485 
1486 	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1487 
1488 	switch (rule->rr_subject_type) {
1489 	case RCTL_SUBJECT_TYPE_PROCESS:
1490 		if (rule->rr_subject.rs_proc == NULL)
1491 			sbuf_printf(sb, ":");
1492 		else
1493 			sbuf_printf(sb, "%d:",
1494 			    rule->rr_subject.rs_proc->p_pid);
1495 		break;
1496 	case RCTL_SUBJECT_TYPE_USER:
1497 		if (rule->rr_subject.rs_uip == NULL)
1498 			sbuf_printf(sb, ":");
1499 		else
1500 			sbuf_printf(sb, "%d:",
1501 			    rule->rr_subject.rs_uip->ui_uid);
1502 		break;
1503 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1504 		if (rule->rr_subject.rs_loginclass == NULL)
1505 			sbuf_printf(sb, ":");
1506 		else
1507 			sbuf_printf(sb, "%s:",
1508 			    rule->rr_subject.rs_loginclass->lc_name);
1509 		break;
1510 	case RCTL_SUBJECT_TYPE_JAIL:
1511 		if (rule->rr_subject.rs_prison_racct == NULL)
1512 			sbuf_printf(sb, ":");
1513 		else
1514 			sbuf_printf(sb, "%s:",
1515 			    rule->rr_subject.rs_prison_racct->prr_name);
1516 		break;
1517 	default:
1518 		panic("rctl_rule_to_sbuf: unknown subject type %d",
1519 		    rule->rr_subject_type);
1520 	}
1521 
1522 	amount = rule->rr_amount;
1523 	if (amount != RCTL_AMOUNT_UNDEFINED &&
1524 	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1525 		amount /= 1000000;
1526 
1527 	sbuf_printf(sb, "%s:%s=%jd",
1528 	    rctl_resource_name(rule->rr_resource),
1529 	    rctl_action_name(rule->rr_action),
1530 	    amount);
1531 
1532 	if (rule->rr_per != rule->rr_subject_type)
1533 		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1534 }
1535 
1536 /*
1537  * Routine used by RCTL syscalls to read in input string.
1538  */
1539 static int
1540 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1541 {
1542 	char *str;
1543 	int error;
1544 
1545 	ASSERT_RACCT_ENABLED();
1546 
1547 	if (inbuflen <= 0)
1548 		return (EINVAL);
1549 	if (inbuflen > RCTL_MAX_INBUFSIZE)
1550 		return (E2BIG);
1551 
1552 	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1553 	error = copyinstr(inbufp, str, inbuflen, NULL);
1554 	if (error != 0) {
1555 		free(str, M_RCTL);
1556 		return (error);
1557 	}
1558 
1559 	*inputstr = str;
1560 
1561 	return (0);
1562 }
1563 
1564 /*
1565  * Routine used by RCTL syscalls to write out output string.
1566  */
1567 static int
1568 rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1569 {
1570 	int error;
1571 
1572 	ASSERT_RACCT_ENABLED();
1573 
1574 	if (outputsbuf == NULL)
1575 		return (0);
1576 
1577 	sbuf_finish(outputsbuf);
1578 	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1579 		sbuf_delete(outputsbuf);
1580 		return (ERANGE);
1581 	}
1582 	error = copyout(sbuf_data(outputsbuf), outbufp,
1583 	    sbuf_len(outputsbuf) + 1);
1584 	sbuf_delete(outputsbuf);
1585 	return (error);
1586 }
1587 
1588 static struct sbuf *
1589 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1590 {
1591 	struct sbuf *sb;
1592 	int64_t amount;
1593 	int i;
1594 
1595 	ASSERT_RACCT_ENABLED();
1596 
1597 	sb = sbuf_new_auto();
1598 	for (i = 0; i <= RACCT_MAX; i++) {
1599 		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1600 			continue;
1601 		RACCT_LOCK();
1602 		amount = racct->r_resources[i];
1603 		RACCT_UNLOCK();
1604 		if (RACCT_IS_IN_MILLIONS(i))
1605 			amount /= 1000000;
1606 		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1607 	}
1608 	sbuf_setpos(sb, sbuf_len(sb) - 1);
1609 	return (sb);
1610 }
1611 
1612 int
1613 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1614 {
1615 	struct rctl_rule *filter;
1616 	struct sbuf *outputsbuf = NULL;
1617 	struct proc *p;
1618 	struct uidinfo *uip;
1619 	struct loginclass *lc;
1620 	struct prison_racct *prr;
1621 	char *inputstr;
1622 	int error;
1623 
1624 	if (!racct_enable)
1625 		return (ENOSYS);
1626 
1627 	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1628 	if (error != 0)
1629 		return (error);
1630 
1631 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1632 	if (error != 0)
1633 		return (error);
1634 
1635 	sx_slock(&allproc_lock);
1636 	error = rctl_string_to_rule(inputstr, &filter);
1637 	free(inputstr, M_RCTL);
1638 	if (error != 0) {
1639 		sx_sunlock(&allproc_lock);
1640 		return (error);
1641 	}
1642 
1643 	switch (filter->rr_subject_type) {
1644 	case RCTL_SUBJECT_TYPE_PROCESS:
1645 		p = filter->rr_subject.rs_proc;
1646 		if (p == NULL) {
1647 			error = EINVAL;
1648 			goto out;
1649 		}
1650 		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1651 		break;
1652 	case RCTL_SUBJECT_TYPE_USER:
1653 		uip = filter->rr_subject.rs_uip;
1654 		if (uip == NULL) {
1655 			error = EINVAL;
1656 			goto out;
1657 		}
1658 		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1659 		break;
1660 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1661 		lc = filter->rr_subject.rs_loginclass;
1662 		if (lc == NULL) {
1663 			error = EINVAL;
1664 			goto out;
1665 		}
1666 		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1667 		break;
1668 	case RCTL_SUBJECT_TYPE_JAIL:
1669 		prr = filter->rr_subject.rs_prison_racct;
1670 		if (prr == NULL) {
1671 			error = EINVAL;
1672 			goto out;
1673 		}
1674 		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1675 		break;
1676 	default:
1677 		error = EINVAL;
1678 	}
1679 out:
1680 	rctl_rule_release(filter);
1681 	sx_sunlock(&allproc_lock);
1682 	if (error != 0)
1683 		return (error);
1684 
1685 	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1686 
1687 	return (error);
1688 }
1689 
1690 static void
1691 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1692 {
1693 	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1694 	struct rctl_rule_link *link;
1695 	struct sbuf *sb = (struct sbuf *)arg3;
1696 
1697 	ASSERT_RACCT_ENABLED();
1698 	RACCT_LOCK_ASSERT();
1699 
1700 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1701 		if (!rctl_rule_matches(link->rrl_rule, filter))
1702 			continue;
1703 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1704 		sbuf_printf(sb, ",");
1705 	}
1706 }
1707 
1708 int
1709 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1710 {
1711 	struct sbuf *sb;
1712 	struct rctl_rule *filter;
1713 	struct rctl_rule_link *link;
1714 	struct proc *p;
1715 	char *inputstr, *buf;
1716 	size_t bufsize;
1717 	int error;
1718 
1719 	if (!racct_enable)
1720 		return (ENOSYS);
1721 
1722 	error = priv_check(td, PRIV_RCTL_GET_RULES);
1723 	if (error != 0)
1724 		return (error);
1725 
1726 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1727 	if (error != 0)
1728 		return (error);
1729 
1730 	sx_slock(&allproc_lock);
1731 	error = rctl_string_to_rule(inputstr, &filter);
1732 	free(inputstr, M_RCTL);
1733 	if (error != 0) {
1734 		sx_sunlock(&allproc_lock);
1735 		return (error);
1736 	}
1737 
1738 	bufsize = uap->outbuflen;
1739 	if (bufsize > rctl_maxbufsize) {
1740 		sx_sunlock(&allproc_lock);
1741 		return (E2BIG);
1742 	}
1743 
1744 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1745 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1746 	KASSERT(sb != NULL, ("sbuf_new failed"));
1747 
1748 	FOREACH_PROC_IN_SYSTEM(p) {
1749 		RACCT_LOCK();
1750 		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1751 			/*
1752 			 * Non-process rules will be added to the buffer later.
1753 			 * Adding them here would result in duplicated output.
1754 			 */
1755 			if (link->rrl_rule->rr_subject_type !=
1756 			    RCTL_SUBJECT_TYPE_PROCESS)
1757 				continue;
1758 			if (!rctl_rule_matches(link->rrl_rule, filter))
1759 				continue;
1760 			rctl_rule_to_sbuf(sb, link->rrl_rule);
1761 			sbuf_printf(sb, ",");
1762 		}
1763 		RACCT_UNLOCK();
1764 	}
1765 
1766 	loginclass_racct_foreach(rctl_get_rules_callback,
1767 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1768 	    filter, sb);
1769 	ui_racct_foreach(rctl_get_rules_callback,
1770 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1771 	    filter, sb);
1772 	prison_racct_foreach(rctl_get_rules_callback,
1773 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1774 	    filter, sb);
1775 	if (sbuf_error(sb) == ENOMEM) {
1776 		error = ERANGE;
1777 		goto out;
1778 	}
1779 
1780 	/*
1781 	 * Remove trailing ",".
1782 	 */
1783 	if (sbuf_len(sb) > 0)
1784 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1785 
1786 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1787 out:
1788 	rctl_rule_release(filter);
1789 	sx_sunlock(&allproc_lock);
1790 	free(buf, M_RCTL);
1791 	return (error);
1792 }
1793 
1794 int
1795 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1796 {
1797 	struct sbuf *sb;
1798 	struct rctl_rule *filter;
1799 	struct rctl_rule_link *link;
1800 	char *inputstr, *buf;
1801 	size_t bufsize;
1802 	int error;
1803 
1804 	if (!racct_enable)
1805 		return (ENOSYS);
1806 
1807 	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1808 	if (error != 0)
1809 		return (error);
1810 
1811 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1812 	if (error != 0)
1813 		return (error);
1814 
1815 	sx_slock(&allproc_lock);
1816 	error = rctl_string_to_rule(inputstr, &filter);
1817 	free(inputstr, M_RCTL);
1818 	if (error != 0) {
1819 		sx_sunlock(&allproc_lock);
1820 		return (error);
1821 	}
1822 
1823 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1824 		rctl_rule_release(filter);
1825 		sx_sunlock(&allproc_lock);
1826 		return (EINVAL);
1827 	}
1828 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1829 		rctl_rule_release(filter);
1830 		sx_sunlock(&allproc_lock);
1831 		return (EOPNOTSUPP);
1832 	}
1833 	if (filter->rr_subject.rs_proc == NULL) {
1834 		rctl_rule_release(filter);
1835 		sx_sunlock(&allproc_lock);
1836 		return (EINVAL);
1837 	}
1838 
1839 	bufsize = uap->outbuflen;
1840 	if (bufsize > rctl_maxbufsize) {
1841 		rctl_rule_release(filter);
1842 		sx_sunlock(&allproc_lock);
1843 		return (E2BIG);
1844 	}
1845 
1846 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1847 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1848 	KASSERT(sb != NULL, ("sbuf_new failed"));
1849 
1850 	RACCT_LOCK();
1851 	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1852 	    rrl_next) {
1853 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1854 		sbuf_printf(sb, ",");
1855 	}
1856 	RACCT_UNLOCK();
1857 	if (sbuf_error(sb) == ENOMEM) {
1858 		error = ERANGE;
1859 		sbuf_delete(sb);
1860 		goto out;
1861 	}
1862 
1863 	/*
1864 	 * Remove trailing ",".
1865 	 */
1866 	if (sbuf_len(sb) > 0)
1867 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1868 
1869 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1870 out:
1871 	rctl_rule_release(filter);
1872 	sx_sunlock(&allproc_lock);
1873 	free(buf, M_RCTL);
1874 	return (error);
1875 }
1876 
1877 int
1878 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1879 {
1880 	struct rctl_rule *rule;
1881 	char *inputstr;
1882 	int error;
1883 
1884 	if (!racct_enable)
1885 		return (ENOSYS);
1886 
1887 	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1888 	if (error != 0)
1889 		return (error);
1890 
1891 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1892 	if (error != 0)
1893 		return (error);
1894 
1895 	sx_slock(&allproc_lock);
1896 	error = rctl_string_to_rule(inputstr, &rule);
1897 	free(inputstr, M_RCTL);
1898 	if (error != 0) {
1899 		sx_sunlock(&allproc_lock);
1900 		return (error);
1901 	}
1902 	/*
1903 	 * The 'per' part of a rule is optional.
1904 	 */
1905 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1906 	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1907 		rule->rr_per = rule->rr_subject_type;
1908 
1909 	if (!rctl_rule_fully_specified(rule)) {
1910 		error = EINVAL;
1911 		goto out;
1912 	}
1913 
1914 	error = rctl_rule_add(rule);
1915 
1916 out:
1917 	rctl_rule_release(rule);
1918 	sx_sunlock(&allproc_lock);
1919 	return (error);
1920 }
1921 
1922 int
1923 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1924 {
1925 	struct rctl_rule *filter;
1926 	char *inputstr;
1927 	int error;
1928 
1929 	if (!racct_enable)
1930 		return (ENOSYS);
1931 
1932 	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1933 	if (error != 0)
1934 		return (error);
1935 
1936 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1937 	if (error != 0)
1938 		return (error);
1939 
1940 	sx_slock(&allproc_lock);
1941 	error = rctl_string_to_rule(inputstr, &filter);
1942 	free(inputstr, M_RCTL);
1943 	if (error != 0) {
1944 		sx_sunlock(&allproc_lock);
1945 		return (error);
1946 	}
1947 
1948 	error = rctl_rule_remove(filter);
1949 	rctl_rule_release(filter);
1950 	sx_sunlock(&allproc_lock);
1951 
1952 	return (error);
1953 }
1954 
/*
 * Update RCTL rule list after credential change.
 *
 * Rebuilds the rule list of process 'p' from its existing per-process
 * rules plus the rules linked to the user (real uid), login class and
 * jail found in 'newcred'.
 *
 * The link entries are allocated with M_WAITOK, which requires dropping
 * the racct lock between counting the rules and filling in the entries.
 * If the set of rules changed in the meantime, the temporary list is
 * thrown away and the whole procedure starts over.
 *
 * Does nothing when racct is disabled.  The process lock must not be
 * held by the caller (asserted).
 */
void
rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
{
	LIST_HEAD(, rctl_rule_link) newrules;
	struct rctl_rule_link *link, *newlink;
	struct uidinfo *newuip;
	struct loginclass *newlc;
	struct prison_racct *newprr;
	int rulecnt, i;

	if (!racct_enable)
		return;

	PROC_LOCK_ASSERT(p, MA_NOTOWNED);

	newuip = newcred->cr_ruidinfo;
	newlc = newcred->cr_loginclass;
	newprr = newcred->cr_prison->pr_prison_racct;

	LIST_INIT(&newrules);

again:
	/*
	 * First, count the rules that apply to the process with new
	 * credentials.
	 */
	rulecnt = 0;
	RACCT_LOCK();
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		if (link->rrl_rule->rr_subject_type ==
		    RCTL_SUBJECT_TYPE_PROCESS)
			rulecnt++;
	}
	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
		rulecnt++;
	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
		rulecnt++;
	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
		rulecnt++;
	RACCT_UNLOCK();

	/*
	 * Create temporary list.  We've dropped the rctl_lock in order
	 * to use M_WAITOK.
	 */
	for (i = 0; i < rulecnt; i++) {
		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
		newlink->rrl_rule = NULL;
		newlink->rrl_exceeded = 0;
		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
	}

	newlink = LIST_FIRST(&newrules);

	/*
	 * Assign rules to the newly allocated list entries.
	 *
	 * Running out of entries (newlink == NULL) means more rules exist
	 * now than were counted above; rulecnt is decremented per assigned
	 * entry so that a non-zero value at the end means fewer rules
	 * exist now.  Both cases lead to a retry.
	 */
	RACCT_LOCK();
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		if (link->rrl_rule->rr_subject_type ==
		    RCTL_SUBJECT_TYPE_PROCESS) {
			if (newlink == NULL)
				goto goaround;
			rctl_rule_acquire(link->rrl_rule);
			newlink->rrl_rule = link->rrl_rule;
			newlink->rrl_exceeded = link->rrl_exceeded;
			newlink = LIST_NEXT(newlink, rrl_next);
			rulecnt--;
		}
	}

	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	if (rulecnt == 0) {
		/*
		 * Free the old rule list.
		 */
		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
			link = LIST_FIRST(&p->p_racct->r_rule_links);
			LIST_REMOVE(link, rrl_next);
			rctl_rule_release(link->rrl_rule);
			uma_zfree(rctl_rule_link_zone, link);
		}

		/*
		 * Replace lists and we're done.
		 *
		 * XXX: Is there any way to switch list heads instead
		 *      of iterating here?
		 */
		while (!LIST_EMPTY(&newrules)) {
			newlink = LIST_FIRST(&newrules);
			LIST_REMOVE(newlink, rrl_next);
			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
			    newlink, rrl_next);
		}

		RACCT_UNLOCK();

		return;
	}

goaround:
	RACCT_UNLOCK();

	/*
	 * Rule list changed while we were not holding the rctl_lock.
	 * Free the new list and try again.
	 */
	while (!LIST_EMPTY(&newrules)) {
		newlink = LIST_FIRST(&newrules);
		LIST_REMOVE(newlink, rrl_next);
		/* Entries that were never filled in hold no reference. */
		if (newlink->rrl_rule != NULL)
			rctl_rule_release(newlink->rrl_rule);
		uma_zfree(rctl_rule_link_zone, newlink);
	}

	goto again;
}
2105 
2106 /*
2107  * Assign RCTL rules to the newly created process.
2108  */
2109 int
2110 rctl_proc_fork(struct proc *parent, struct proc *child)
2111 {
2112 	struct rctl_rule *rule;
2113 	struct rctl_rule_link *link;
2114 	int error;
2115 
2116 	ASSERT_RACCT_ENABLED();
2117 	RACCT_LOCK_ASSERT();
2118 	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2119 
2120 	LIST_INIT(&child->p_racct->r_rule_links);
2121 
2122 	/*
2123 	 * Go through limits applicable to the parent and assign them
2124 	 * to the child.  Rules with 'process' subject have to be duplicated
2125 	 * in order to make their rr_subject point to the new process.
2126 	 */
2127 	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2128 		if (link->rrl_rule->rr_subject_type ==
2129 		    RCTL_SUBJECT_TYPE_PROCESS) {
2130 			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2131 			if (rule == NULL)
2132 				goto fail;
2133 			KASSERT(rule->rr_subject.rs_proc == parent,
2134 			    ("rule->rr_subject.rs_proc != parent"));
2135 			rule->rr_subject.rs_proc = child;
2136 			error = rctl_racct_add_rule_locked(child->p_racct,
2137 			    rule);
2138 			rctl_rule_release(rule);
2139 			if (error != 0)
2140 				goto fail;
2141 		} else {
2142 			error = rctl_racct_add_rule_locked(child->p_racct,
2143 			    link->rrl_rule);
2144 			if (error != 0)
2145 				goto fail;
2146 		}
2147 	}
2148 
2149 	return (0);
2150 
2151 fail:
2152 	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2153 		link = LIST_FIRST(&child->p_racct->r_rule_links);
2154 		LIST_REMOVE(link, rrl_next);
2155 		rctl_rule_release(link->rrl_rule);
2156 		uma_zfree(rctl_rule_link_zone, link);
2157 	}
2158 
2159 	return (EAGAIN);
2160 }
2161 
2162 /*
2163  * Release rules attached to the racct.
2164  */
2165 void
2166 rctl_racct_release(struct racct *racct)
2167 {
2168 	struct rctl_rule_link *link;
2169 
2170 	ASSERT_RACCT_ENABLED();
2171 	RACCT_LOCK_ASSERT();
2172 
2173 	while (!LIST_EMPTY(&racct->r_rule_links)) {
2174 		link = LIST_FIRST(&racct->r_rule_links);
2175 		LIST_REMOVE(link, rrl_next);
2176 		rctl_rule_release(link->rrl_rule);
2177 		uma_zfree(rctl_rule_link_zone, link);
2178 	}
2179 }
2180 
/*
 * One-time RCTL setup: creates the UMA zones used for rules and rule
 * links and fills in defaults for the throttle settings that were left
 * at out-of-range values.  Does nothing when racct is disabled.
 */
static void
rctl_init(void)
{

	if (!racct_enable)
		return;

	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

	/*
	 * Set default values, making sure not to overwrite the ones
	 * fetched from tunables.  Most of those could be set at the
	 * declaration, except for the rctl_throttle_max - we cannot
	 * set it there due to hz not being compile time constant.
	 */
	if (rctl_throttle_min < 1)
		rctl_throttle_min = 1;
	/* A maximum below the minimum is replaced by 2 * hz. */
	if (rctl_throttle_max < rctl_throttle_min)
		rctl_throttle_max = 2 * hz;
	if (rctl_throttle_pct < 0)
		rctl_throttle_pct = 100;
	if (rctl_throttle_pct2 < 0)
		rctl_throttle_pct2 = 100;
}
2209 
2210 #else /* !RCTL */
2211 
/*
 * Stub compiled in the !RCTL branch of this file: the system call is
 * not available and always fails with ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

	return (ENOSYS);
}
2218 
/*
 * Stub compiled in the !RCTL branch of this file: the system call is
 * not available and always fails with ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

	return (ENOSYS);
}
2225 
/*
 * Stub compiled in the !RCTL branch of this file: the system call is
 * not available and always fails with ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

	return (ENOSYS);
}
2232 
/*
 * Stub compiled in the !RCTL branch of this file: the system call is
 * not available and always fails with ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

	return (ENOSYS);
}
2239 
/*
 * Stub compiled in the !RCTL branch of this file: the system call is
 * not available and always fails with ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

	return (ENOSYS);
}
2246 
2247 #endif /* !RCTL */
2248