xref: /freebsd/sys/kern/kern_rctl.c (revision e5c4075f6b6616fdf7370e11fd948c516e1858d7)
1 /*-
2  * Copyright (c) 2010 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <sys/param.h>
36 #include <sys/bus.h>
37 #include <sys/malloc.h>
38 #include <sys/queue.h>
39 #include <sys/refcount.h>
40 #include <sys/jail.h>
41 #include <sys/kernel.h>
42 #include <sys/limits.h>
43 #include <sys/loginclass.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/racct.h>
47 #include <sys/rctl.h>
48 #include <sys/resourcevar.h>
49 #include <sys/sx.h>
50 #include <sys/sysent.h>
51 #include <sys/sysproto.h>
52 #include <sys/systm.h>
53 #include <sys/types.h>
54 #include <sys/eventhandler.h>
55 #include <sys/lock.h>
56 #include <sys/mutex.h>
57 #include <sys/rwlock.h>
58 #include <sys/sbuf.h>
59 #include <sys/taskqueue.h>
60 #include <sys/tree.h>
61 #include <vm/uma.h>
62 
63 #ifdef RCTL
64 #ifndef RACCT
65 #error "The RCTL option requires the RACCT option"
66 #endif
67 
68 FEATURE(rctl, "Resource Limits");
69 
/*
 * NOTE(review): the HRF_ flags are not referenced in the visible part of
 * this file; their meaning is presumably what the names suggest - confirm
 * against the users.
 */
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer size limits.  The products are parenthesized so that the macros
 * expand correctly inside larger expressions, e.g. 'x / RCTL_MAX_INBUFSIZE'.
 * RCTL_MAX_OUTBUFSIZE is the default for the kern.racct.rctl.maxbufsize
 * sysctl; RCTL_LOG_BUFSIZE is the size of the temporary buffer used to
 * format a rule for the "log" and "devctl" actions.
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/*
 * Largest amount by which rctl_pcpu_available() lowers the reported
 * available %cpu.
 */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
79 
/*
 * Backing storage for the kern.racct.rctl.maxbufsize sysctl ("Maximum
 * output buffer size").
 */
static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
/*
 * Rate limits handed to ppsratecheck() in rctl_enforce() for the "log"
 * and "devctl" actions, respectively.
 */
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;

/*
 * Values below are initialized in rctl_init().
 */
static int rctl_throttle_min = -1;
static int rctl_throttle_max = -1;
static int rctl_throttle_pct = -1;
static int rctl_throttle_pct2 = -1;

/*
 * Sysctl handlers that validate a new value for the matching
 * rctl_throttle_* variable before storing it.
 */
static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);
96 
/*
 * The kern.racct.rctl sysctl subtree.  The plain integer knobs are exported
 * directly; the throttle_* knobs go through handler functions so that new
 * values can be range-checked, and each of them is also registered as a
 * loader tunable under the same name.
 */
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
    &rctl_maxbufsize, 0, "Maximum output buffer size");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU",
    "Shortest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU",
    "Longest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU",
    "Throttling penalty for process consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU",
    "Throttling penalty for container consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
120 
/*
 * 'rctl_rule_link' connects a rule with every racct it's related to.
 * For example, rule 'user:X:openfiles:deny=N/process' is linked
 * with uidinfo for user X, and to each process of that user.
 */
struct rctl_rule_link {
	/* Linkage on the owning racct's r_rule_links list. */
	LIST_ENTRY(rctl_rule_link)	rrl_next;
	/* The rule this link refers to; the link holds a reference on it. */
	struct rctl_rule		*rrl_rule;
	/*
	 * Set by rctl_enforce() once a log, devctl or signal action has
	 * fired for this link, and cleared when the rule is no longer
	 * exceeded, so that those actions trigger once per excursion.
	 */
	int				rrl_exceeded;
};
131 
/*
 * One entry of a keyword table: a textual name and the integer constant
 * it stands for.  Tables are terminated by an entry with a NULL d_name.
 */
struct dict {
	const char	*d_name;
	int		d_value;
};
136 
137 static struct dict subjectnames[] = {
138 	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
139 	{ "user", RCTL_SUBJECT_TYPE_USER },
140 	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
141 	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
142 	{ NULL, -1 }};
143 
144 static struct dict resourcenames[] = {
145 	{ "cputime", RACCT_CPU },
146 	{ "datasize", RACCT_DATA },
147 	{ "stacksize", RACCT_STACK },
148 	{ "coredumpsize", RACCT_CORE },
149 	{ "memoryuse", RACCT_RSS },
150 	{ "memorylocked", RACCT_MEMLOCK },
151 	{ "maxproc", RACCT_NPROC },
152 	{ "openfiles", RACCT_NOFILE },
153 	{ "vmemoryuse", RACCT_VMEM },
154 	{ "pseudoterminals", RACCT_NPTS },
155 	{ "swapuse", RACCT_SWAP },
156 	{ "nthr", RACCT_NTHR },
157 	{ "msgqqueued", RACCT_MSGQQUEUED },
158 	{ "msgqsize", RACCT_MSGQSIZE },
159 	{ "nmsgq", RACCT_NMSGQ },
160 	{ "nsem", RACCT_NSEM },
161 	{ "nsemop", RACCT_NSEMOP },
162 	{ "nshm", RACCT_NSHM },
163 	{ "shmsize", RACCT_SHMSIZE },
164 	{ "wallclock", RACCT_WALLCLOCK },
165 	{ "pcpu", RACCT_PCTCPU },
166 	{ "readbps", RACCT_READBPS },
167 	{ "writebps", RACCT_WRITEBPS },
168 	{ "readiops", RACCT_READIOPS },
169 	{ "writeiops", RACCT_WRITEIOPS },
170 	{ NULL, -1 }};
171 
172 static struct dict actionnames[] = {
173 	{ "sighup", RCTL_ACTION_SIGHUP },
174 	{ "sigint", RCTL_ACTION_SIGINT },
175 	{ "sigquit", RCTL_ACTION_SIGQUIT },
176 	{ "sigill", RCTL_ACTION_SIGILL },
177 	{ "sigtrap", RCTL_ACTION_SIGTRAP },
178 	{ "sigabrt", RCTL_ACTION_SIGABRT },
179 	{ "sigemt", RCTL_ACTION_SIGEMT },
180 	{ "sigfpe", RCTL_ACTION_SIGFPE },
181 	{ "sigkill", RCTL_ACTION_SIGKILL },
182 	{ "sigbus", RCTL_ACTION_SIGBUS },
183 	{ "sigsegv", RCTL_ACTION_SIGSEGV },
184 	{ "sigsys", RCTL_ACTION_SIGSYS },
185 	{ "sigpipe", RCTL_ACTION_SIGPIPE },
186 	{ "sigalrm", RCTL_ACTION_SIGALRM },
187 	{ "sigterm", RCTL_ACTION_SIGTERM },
188 	{ "sigurg", RCTL_ACTION_SIGURG },
189 	{ "sigstop", RCTL_ACTION_SIGSTOP },
190 	{ "sigtstp", RCTL_ACTION_SIGTSTP },
191 	{ "sigchld", RCTL_ACTION_SIGCHLD },
192 	{ "sigttin", RCTL_ACTION_SIGTTIN },
193 	{ "sigttou", RCTL_ACTION_SIGTTOU },
194 	{ "sigio", RCTL_ACTION_SIGIO },
195 	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
196 	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
197 	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
198 	{ "sigprof", RCTL_ACTION_SIGPROF },
199 	{ "sigwinch", RCTL_ACTION_SIGWINCH },
200 	{ "siginfo", RCTL_ACTION_SIGINFO },
201 	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
202 	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
203 	{ "sigthr", RCTL_ACTION_SIGTHR },
204 	{ "deny", RCTL_ACTION_DENY },
205 	{ "log", RCTL_ACTION_LOG },
206 	{ "devctl", RCTL_ACTION_DEVCTL },
207 	{ "throttle", RCTL_ACTION_THROTTLE },
208 	{ NULL, -1 }};
209 
/* Subsystem initialization hook, registered to run at SI_SUB_RACCT. */
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);

/* UMA zones backing struct rctl_rule and struct rctl_rule_link. */
static uma_zone_t rctl_rule_zone;
static uma_zone_t rctl_rule_link_zone;
/*
 * Reader/writer lock taken around accesses to the per-racct rule link
 * lists and around updates of the throttle tunables.
 */
static struct rwlock rctl_lock;
RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock");

/* Shorthands for taking, dropping and asserting rctl_lock. */
#define RCTL_RLOCK()		rw_rlock(&rctl_lock)
#define RCTL_RUNLOCK()		rw_runlock(&rctl_lock)
#define RCTL_WLOCK()		rw_wlock(&rctl_lock)
#define RCTL_WUNLOCK()		rw_wunlock(&rctl_lock)
#define RCTL_LOCK_ASSERT()	rw_assert(&rctl_lock, RA_LOCKED)
#define RCTL_WLOCK_ASSERT()	rw_assert(&rctl_lock, RA_WLOCKED)

/* Forward declarations for helpers used before their definition. */
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);

/* Malloc type for temporary buffers allocated by this file. */
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
229 
230 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
231 {
232 	int error, val = rctl_throttle_min;
233 
234 	error = sysctl_handle_int(oidp, &val, 0, req);
235 	if (error || !req->newptr)
236 		return (error);
237 	if (val < 1 || val > rctl_throttle_max)
238 		return (EINVAL);
239 
240 	RCTL_WLOCK();
241 	rctl_throttle_min = val;
242 	RCTL_WUNLOCK();
243 
244 	return (0);
245 }
246 
247 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
248 {
249 	int error, val = rctl_throttle_max;
250 
251 	error = sysctl_handle_int(oidp, &val, 0, req);
252 	if (error || !req->newptr)
253 		return (error);
254 	if (val < rctl_throttle_min)
255 		return (EINVAL);
256 
257 	RCTL_WLOCK();
258 	rctl_throttle_max = val;
259 	RCTL_WUNLOCK();
260 
261 	return (0);
262 }
263 
264 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
265 {
266 	int error, val = rctl_throttle_pct;
267 
268 	error = sysctl_handle_int(oidp, &val, 0, req);
269 	if (error || !req->newptr)
270 		return (error);
271 	if (val < 0)
272 		return (EINVAL);
273 
274 	RCTL_WLOCK();
275 	rctl_throttle_pct = val;
276 	RCTL_WUNLOCK();
277 
278 	return (0);
279 }
280 
281 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
282 {
283 	int error, val = rctl_throttle_pct2;
284 
285 	error = sysctl_handle_int(oidp, &val, 0, req);
286 	if (error || !req->newptr)
287 		return (error);
288 	if (val < 0)
289 		return (EINVAL);
290 
291 	RCTL_WLOCK();
292 	rctl_throttle_pct2 = val;
293 	RCTL_WUNLOCK();
294 
295 	return (0);
296 }
297 
298 static const char *
299 rctl_subject_type_name(int subject)
300 {
301 	int i;
302 
303 	for (i = 0; subjectnames[i].d_name != NULL; i++) {
304 		if (subjectnames[i].d_value == subject)
305 			return (subjectnames[i].d_name);
306 	}
307 
308 	panic("rctl_subject_type_name: unknown subject type %d", subject);
309 }
310 
311 static const char *
312 rctl_action_name(int action)
313 {
314 	int i;
315 
316 	for (i = 0; actionnames[i].d_name != NULL; i++) {
317 		if (actionnames[i].d_value == action)
318 			return (actionnames[i].d_name);
319 	}
320 
321 	panic("rctl_action_name: unknown action %d", action);
322 }
323 
324 const char *
325 rctl_resource_name(int resource)
326 {
327 	int i;
328 
329 	for (i = 0; resourcenames[i].d_name != NULL; i++) {
330 		if (resourcenames[i].d_value == resource)
331 			return (resourcenames[i].d_name);
332 	}
333 
334 	panic("rctl_resource_name: unknown resource %d", resource);
335 }
336 
337 static struct racct *
338 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
339 {
340 	struct ucred *cred = p->p_ucred;
341 
342 	ASSERT_RACCT_ENABLED();
343 	RCTL_LOCK_ASSERT();
344 
345 	switch (rule->rr_per) {
346 	case RCTL_SUBJECT_TYPE_PROCESS:
347 		return (p->p_racct);
348 	case RCTL_SUBJECT_TYPE_USER:
349 		return (cred->cr_ruidinfo->ui_racct);
350 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
351 		return (cred->cr_loginclass->lc_racct);
352 	case RCTL_SUBJECT_TYPE_JAIL:
353 		return (cred->cr_prison->pr_prison_racct->prr_racct);
354 	default:
355 		panic("%s: unknown per %d", __func__, rule->rr_per);
356 	}
357 }
358 
359 /*
360  * Return the amount of resource that can be allocated by 'p' before
361  * hitting 'rule'.
362  */
363 static int64_t
364 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
365 {
366 	const struct racct *racct;
367 	int64_t available;
368 
369 	ASSERT_RACCT_ENABLED();
370 	RCTL_LOCK_ASSERT();
371 
372 	racct = rctl_proc_rule_to_racct(p, rule);
373 	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
374 
375 	return (available);
376 }
377 
378 /*
379  * Called every second for proc, uidinfo, loginclass, and jail containers.
380  * If the limit isn't exceeded, it decreases the usage amount to zero.
381  * Otherwise, it decreases it by the value of the limit.  This way
382  * resource consumption exceeding the limit "carries over" to the next
383  * period.
384  */
385 void
386 rctl_throttle_decay(struct racct *racct, int resource)
387 {
388 	struct rctl_rule *rule;
389 	struct rctl_rule_link *link;
390 	int64_t minavailable;
391 
392 	ASSERT_RACCT_ENABLED();
393 
394 	minavailable = INT64_MAX;
395 
396 	RCTL_RLOCK();
397 
398 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
399 		rule = link->rrl_rule;
400 
401 		if (rule->rr_resource != resource)
402 			continue;
403 		if (rule->rr_action != RCTL_ACTION_THROTTLE)
404 			continue;
405 
406 		if (rule->rr_amount < minavailable)
407 			minavailable = rule->rr_amount;
408 	}
409 
410 	RCTL_RUNLOCK();
411 
412 	if (racct->r_resources[resource] < minavailable) {
413 		racct->r_resources[resource] = 0;
414 	} else {
415 		/*
416 		 * Cap utilization counter at ten times the limit.  Otherwise,
417 		 * if we changed the rule lowering the allowed amount, it could
418 		 * take unreasonably long time for the accumulated resource
419 		 * usage to drop.
420 		 */
421 		if (racct->r_resources[resource] > minavailable * 10)
422 			racct->r_resources[resource] = minavailable * 10;
423 
424 		racct->r_resources[resource] -= minavailable;
425 	}
426 }
427 
428 /*
429  * Special version of rctl_get_available() for the %CPU resource.
430  * We slightly cheat here and return less than we normally would.
431  */
432 int64_t
433 rctl_pcpu_available(const struct proc *p) {
434 	struct rctl_rule *rule;
435 	struct rctl_rule_link *link;
436 	int64_t available, minavailable, limit;
437 
438 	ASSERT_RACCT_ENABLED();
439 
440 	minavailable = INT64_MAX;
441 	limit = 0;
442 
443 	RCTL_RLOCK();
444 
445 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
446 		rule = link->rrl_rule;
447 		if (rule->rr_resource != RACCT_PCTCPU)
448 			continue;
449 		if (rule->rr_action != RCTL_ACTION_DENY)
450 			continue;
451 		available = rctl_available_resource(p, rule);
452 		if (available < minavailable) {
453 			minavailable = available;
454 			limit = rule->rr_amount;
455 		}
456 	}
457 
458 	RCTL_RUNLOCK();
459 
460 	/*
461 	 * Return slightly less than actual value of the available
462 	 * %cpu resource.  This makes %cpu throttling more agressive
463 	 * and lets us act sooner than the limits are already exceeded.
464 	 */
465 	if (limit != 0) {
466 		if (limit > 2 * RCTL_PCPU_SHIFT)
467 			minavailable -= RCTL_PCPU_SHIFT;
468 		else
469 			minavailable -= (limit / 2);
470 	}
471 
472 	return (minavailable);
473 }
474 
/*
 * Saturating unsigned addition: returns a + b, or UINT64_MAX when the
 * sum does not fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{
	uint64_t sum = a + b;

	/* A wrapped sum is smaller than at least one of the operands. */
	return ((sum < a || sum < b) ? UINT64_MAX : sum);
}
490 
/*
 * Saturating unsigned multiplication: returns a * b, or UINT64_MAX when
 * the product does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	if (b == 0)
		return (0);
	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
500 
501 /*
502  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
503  * to what it keeps allocated now.  Returns non-zero if the allocation should
504  * be denied, 0 otherwise.
505  */
506 int
507 rctl_enforce(struct proc *p, int resource, uint64_t amount)
508 {
509 	static struct timeval log_lasttime, devctl_lasttime;
510 	static int log_curtime = 0, devctl_curtime = 0;
511 	struct rctl_rule *rule;
512 	struct rctl_rule_link *link;
513 	struct sbuf sb;
514 	char *buf;
515 	int64_t available;
516 	uint64_t sleep_ms, sleep_ratio;
517 	int should_deny = 0;
518 
519 
520 	ASSERT_RACCT_ENABLED();
521 
522 	RCTL_RLOCK();
523 
524 	/*
525 	 * There may be more than one matching rule; go through all of them.
526 	 * Denial should be done last, after logging and sending signals.
527 	 */
528 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
529 		rule = link->rrl_rule;
530 		if (rule->rr_resource != resource)
531 			continue;
532 
533 		available = rctl_available_resource(p, rule);
534 		if (available >= (int64_t)amount) {
535 			link->rrl_exceeded = 0;
536 			continue;
537 		}
538 
539 		switch (rule->rr_action) {
540 		case RCTL_ACTION_DENY:
541 			should_deny = 1;
542 			continue;
543 		case RCTL_ACTION_LOG:
544 			/*
545 			 * If rrl_exceeded != 0, it means we've already
546 			 * logged a warning for this process.
547 			 */
548 			if (link->rrl_exceeded != 0)
549 				continue;
550 
551 			/*
552 			 * If the process state is not fully initialized yet,
553 			 * we can't access most of the required fields, e.g.
554 			 * p->p_comm.  This happens when called from fork1().
555 			 * Ignore this rule for now; it will be processed just
556 			 * after fork, when called from racct_proc_fork_done().
557 			 */
558 			if (p->p_state != PRS_NORMAL)
559 				continue;
560 
561 			if (!ppsratecheck(&log_lasttime, &log_curtime,
562 			    rctl_log_rate_limit))
563 				continue;
564 
565 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
566 			if (buf == NULL) {
567 				printf("rctl_enforce: out of memory\n");
568 				continue;
569 			}
570 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
571 			rctl_rule_to_sbuf(&sb, rule);
572 			sbuf_finish(&sb);
573 			printf("rctl: rule \"%s\" matched by pid %d "
574 			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
575 			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
576 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
577 			sbuf_delete(&sb);
578 			free(buf, M_RCTL);
579 			link->rrl_exceeded = 1;
580 			continue;
581 		case RCTL_ACTION_DEVCTL:
582 			if (link->rrl_exceeded != 0)
583 				continue;
584 
585 			if (p->p_state != PRS_NORMAL)
586 				continue;
587 
588 			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
589 			    rctl_devctl_rate_limit))
590 				continue;
591 
592 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
593 			if (buf == NULL) {
594 				printf("rctl_enforce: out of memory\n");
595 				continue;
596 			}
597 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
598 			sbuf_printf(&sb, "rule=");
599 			rctl_rule_to_sbuf(&sb, rule);
600 			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
601 			    p->p_pid, p->p_ucred->cr_ruid,
602 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
603 			sbuf_finish(&sb);
604 			devctl_notify_f("RCTL", "rule", "matched",
605 			    sbuf_data(&sb), M_NOWAIT);
606 			sbuf_delete(&sb);
607 			free(buf, M_RCTL);
608 			link->rrl_exceeded = 1;
609 			continue;
610 		case RCTL_ACTION_THROTTLE:
611 			if (p->p_state != PRS_NORMAL)
612 				continue;
613 
614 			/*
615 			 * Make the process sleep for a fraction of second
616 			 * proportional to the ratio of process' resource
617 			 * utilization compared to the limit.  The point is
618 			 * to penalize resource hogs: processes that consume
619 			 * more of the available resources sleep for longer.
620 			 *
621 			 * We're trying to defer division until the very end,
622 			 * to minimize the rounding effects.  The following
623 			 * calculation could have been written in a clearer
624 			 * way like this:
625 			 *
626 			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
627 			 *     rule->rr_amount;
628 			 * sleep_ms *= rctl_throttle_pct / 100;
629 			 * if (sleep_ms < rctl_throttle_min)
630 			 *         sleep_ms = rctl_throttle_min;
631 			 *
632 			 */
633 			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
634 			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
635 			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
636 				sleep_ms = rctl_throttle_min * rule->rr_amount;
637 
638 			/*
639 			 * Multiply that by the ratio of the resource
640 			 * consumption for the container compared to the limit,
641 			 * squared.  In other words, a process in a container
642 			 * that is two times over the limit will be throttled
643 			 * four times as much for hitting the same rule.  The
644 			 * point is to penalize processes more if the container
645 			 * itself (eg certain UID or jail) is above the limit.
646 			 */
647 			if (available < 0)
648 				sleep_ratio = -available / rule->rr_amount;
649 			else
650 				sleep_ratio = 0;
651 			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
652 			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
653 			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
654 
655 			/*
656 			 * Finally the division.
657 			 */
658 			sleep_ms /= rule->rr_amount;
659 
660 			if (sleep_ms > rctl_throttle_max)
661 				sleep_ms = rctl_throttle_max;
662 #if 0
663 			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
664 			   __func__, p->p_pid, p->p_comm,
665 			   p->p_racct->r_resources[resource],
666 			   rule->rr_amount, (uintmax_t)sleep_ms,
667 			   (uintmax_t)sleep_ratio, (intmax_t)available);
668 #endif
669 
670 			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
671 			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
672 			racct_proc_throttle(p, sleep_ms);
673 			continue;
674 		default:
675 			if (link->rrl_exceeded != 0)
676 				continue;
677 
678 			if (p->p_state != PRS_NORMAL)
679 				continue;
680 
681 			KASSERT(rule->rr_action > 0 &&
682 			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
683 			    ("rctl_enforce: unknown action %d",
684 			     rule->rr_action));
685 
686 			/*
687 			 * We're using the fact that RCTL_ACTION_SIG* values
688 			 * are equal to their counterparts from sys/signal.h.
689 			 */
690 			kern_psignal(p, rule->rr_action);
691 			link->rrl_exceeded = 1;
692 			continue;
693 		}
694 	}
695 
696 	RCTL_RUNLOCK();
697 
698 	if (should_deny) {
699 		/*
700 		 * Return fake error code; the caller should change it
701 		 * into one proper for the situation - EFSIZ, ENOMEM etc.
702 		 */
703 		return (EDOOFUS);
704 	}
705 
706 	return (0);
707 }
708 
709 uint64_t
710 rctl_get_limit(struct proc *p, int resource)
711 {
712 	struct rctl_rule *rule;
713 	struct rctl_rule_link *link;
714 	uint64_t amount = UINT64_MAX;
715 
716 	ASSERT_RACCT_ENABLED();
717 
718 	RCTL_RLOCK();
719 
720 	/*
721 	 * There may be more than one matching rule; go through all of them.
722 	 * Denial should be done last, after logging and sending signals.
723 	 */
724 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
725 		rule = link->rrl_rule;
726 		if (rule->rr_resource != resource)
727 			continue;
728 		if (rule->rr_action != RCTL_ACTION_DENY)
729 			continue;
730 		if (rule->rr_amount < amount)
731 			amount = rule->rr_amount;
732 	}
733 
734 	RCTL_RUNLOCK();
735 
736 	return (amount);
737 }
738 
739 uint64_t
740 rctl_get_available(struct proc *p, int resource)
741 {
742 	struct rctl_rule *rule;
743 	struct rctl_rule_link *link;
744 	int64_t available, minavailable, allocated;
745 
746 	minavailable = INT64_MAX;
747 
748 	ASSERT_RACCT_ENABLED();
749 
750 	RCTL_RLOCK();
751 
752 	/*
753 	 * There may be more than one matching rule; go through all of them.
754 	 * Denial should be done last, after logging and sending signals.
755 	 */
756 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
757 		rule = link->rrl_rule;
758 		if (rule->rr_resource != resource)
759 			continue;
760 		if (rule->rr_action != RCTL_ACTION_DENY)
761 			continue;
762 		available = rctl_available_resource(p, rule);
763 		if (available < minavailable)
764 			minavailable = available;
765 	}
766 
767 	RCTL_RUNLOCK();
768 
769 	/*
770 	 * XXX: Think about this _hard_.
771 	 */
772 	allocated = p->p_racct->r_resources[resource];
773 	if (minavailable < INT64_MAX - allocated)
774 		minavailable += allocated;
775 	if (minavailable < 0)
776 		minavailable = 0;
777 	return (minavailable);
778 }
779 
780 static int
781 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
782 {
783 
784 	ASSERT_RACCT_ENABLED();
785 
786 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
787 		if (rule->rr_subject_type != filter->rr_subject_type)
788 			return (0);
789 
790 		switch (filter->rr_subject_type) {
791 		case RCTL_SUBJECT_TYPE_PROCESS:
792 			if (filter->rr_subject.rs_proc != NULL &&
793 			    rule->rr_subject.rs_proc !=
794 			    filter->rr_subject.rs_proc)
795 				return (0);
796 			break;
797 		case RCTL_SUBJECT_TYPE_USER:
798 			if (filter->rr_subject.rs_uip != NULL &&
799 			    rule->rr_subject.rs_uip !=
800 			    filter->rr_subject.rs_uip)
801 				return (0);
802 			break;
803 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
804 			if (filter->rr_subject.rs_loginclass != NULL &&
805 			    rule->rr_subject.rs_loginclass !=
806 			    filter->rr_subject.rs_loginclass)
807 				return (0);
808 			break;
809 		case RCTL_SUBJECT_TYPE_JAIL:
810 			if (filter->rr_subject.rs_prison_racct != NULL &&
811 			    rule->rr_subject.rs_prison_racct !=
812 			    filter->rr_subject.rs_prison_racct)
813 				return (0);
814 			break;
815 		default:
816 			panic("rctl_rule_matches: unknown subject type %d",
817 			    filter->rr_subject_type);
818 		}
819 	}
820 
821 	if (filter->rr_resource != RACCT_UNDEFINED) {
822 		if (rule->rr_resource != filter->rr_resource)
823 			return (0);
824 	}
825 
826 	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
827 		if (rule->rr_action != filter->rr_action)
828 			return (0);
829 	}
830 
831 	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
832 		if (rule->rr_amount != filter->rr_amount)
833 			return (0);
834 	}
835 
836 	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
837 		if (rule->rr_per != filter->rr_per)
838 			return (0);
839 	}
840 
841 	return (1);
842 }
843 
844 static int
845 str2value(const char *str, int *value, struct dict *table)
846 {
847 	int i;
848 
849 	if (value == NULL)
850 		return (EINVAL);
851 
852 	for (i = 0; table[i].d_name != NULL; i++) {
853 		if (strcasecmp(table[i].d_name, str) == 0) {
854 			*value =  table[i].d_value;
855 			return (0);
856 		}
857 	}
858 
859 	return (EINVAL);
860 }
861 
862 static int
863 str2id(const char *str, id_t *value)
864 {
865 	char *end;
866 
867 	if (str == NULL)
868 		return (EINVAL);
869 
870 	*value = strtoul(str, &end, 10);
871 	if ((size_t)(end - str) != strlen(str))
872 		return (EINVAL);
873 
874 	return (0);
875 }
876 
/*
 * Parse 'str' as a non-negative decimal amount into '*value'.
 *
 * The conversion uses strtouq() so that the full 64-bit range is
 * available on every platform; strtoul() yields an unsigned long, which
 * is only 32 bits wide on ILP32 machines and would silently saturate
 * larger amounts there.
 *
 * Returns 0 on success, EINVAL when 'str' is NULL or is not consumed in
 * full by the conversion, and ERANGE when the result does not fit in a
 * non-negative int64_t.  '*value' is written whenever 'str' is not NULL.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *end;

	if (str == NULL)
		return (EINVAL);

	*value = strtouq(str, &end, 10);
	if ((size_t)(end - str) != strlen(str))
		return (EINVAL);

	/* Values above INT64_MAX show up as negative after the conversion. */
	if (*value < 0)
		return (ERANGE);

	return (0);
}
894 
895 /*
896  * Connect the rule to the racct, increasing refcount for the rule.
897  */
898 static void
899 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
900 {
901 	struct rctl_rule_link *link;
902 
903 	ASSERT_RACCT_ENABLED();
904 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
905 
906 	rctl_rule_acquire(rule);
907 	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
908 	link->rrl_rule = rule;
909 	link->rrl_exceeded = 0;
910 
911 	RCTL_WLOCK();
912 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
913 	RCTL_WUNLOCK();
914 }
915 
916 static int
917 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
918 {
919 	struct rctl_rule_link *link;
920 
921 	ASSERT_RACCT_ENABLED();
922 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
923 	RCTL_WLOCK_ASSERT();
924 
925 	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
926 	if (link == NULL)
927 		return (ENOMEM);
928 	rctl_rule_acquire(rule);
929 	link->rrl_rule = rule;
930 	link->rrl_exceeded = 0;
931 
932 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
933 	return (0);
934 }
935 
936 /*
937  * Remove limits for a rules matching the filter and release
938  * the refcounts for the rules, possibly freeing them.  Returns
939  * the number of limit structures removed.
940  */
941 static int
942 rctl_racct_remove_rules(struct racct *racct,
943     const struct rctl_rule *filter)
944 {
945 	struct rctl_rule_link *link, *linktmp;
946 	int removed = 0;
947 
948 	ASSERT_RACCT_ENABLED();
949 	RCTL_WLOCK_ASSERT();
950 
951 	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
952 		if (!rctl_rule_matches(link->rrl_rule, filter))
953 			continue;
954 
955 		LIST_REMOVE(link, rrl_next);
956 		rctl_rule_release(link->rrl_rule);
957 		uma_zfree(rctl_rule_link_zone, link);
958 		removed++;
959 	}
960 	return (removed);
961 }
962 
963 static void
964 rctl_rule_acquire_subject(struct rctl_rule *rule)
965 {
966 
967 	ASSERT_RACCT_ENABLED();
968 
969 	switch (rule->rr_subject_type) {
970 	case RCTL_SUBJECT_TYPE_UNDEFINED:
971 	case RCTL_SUBJECT_TYPE_PROCESS:
972 		break;
973 	case RCTL_SUBJECT_TYPE_JAIL:
974 		if (rule->rr_subject.rs_prison_racct != NULL)
975 			prison_racct_hold(rule->rr_subject.rs_prison_racct);
976 		break;
977 	case RCTL_SUBJECT_TYPE_USER:
978 		if (rule->rr_subject.rs_uip != NULL)
979 			uihold(rule->rr_subject.rs_uip);
980 		break;
981 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
982 		if (rule->rr_subject.rs_loginclass != NULL)
983 			loginclass_hold(rule->rr_subject.rs_loginclass);
984 		break;
985 	default:
986 		panic("rctl_rule_acquire_subject: unknown subject type %d",
987 		    rule->rr_subject_type);
988 	}
989 }
990 
991 static void
992 rctl_rule_release_subject(struct rctl_rule *rule)
993 {
994 
995 	ASSERT_RACCT_ENABLED();
996 
997 	switch (rule->rr_subject_type) {
998 	case RCTL_SUBJECT_TYPE_UNDEFINED:
999 	case RCTL_SUBJECT_TYPE_PROCESS:
1000 		break;
1001 	case RCTL_SUBJECT_TYPE_JAIL:
1002 		if (rule->rr_subject.rs_prison_racct != NULL)
1003 			prison_racct_free(rule->rr_subject.rs_prison_racct);
1004 		break;
1005 	case RCTL_SUBJECT_TYPE_USER:
1006 		if (rule->rr_subject.rs_uip != NULL)
1007 			uifree(rule->rr_subject.rs_uip);
1008 		break;
1009 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1010 		if (rule->rr_subject.rs_loginclass != NULL)
1011 			loginclass_free(rule->rr_subject.rs_loginclass);
1012 		break;
1013 	default:
1014 		panic("rctl_rule_release_subject: unknown subject type %d",
1015 		    rule->rr_subject_type);
1016 	}
1017 }
1018 
1019 struct rctl_rule *
1020 rctl_rule_alloc(int flags)
1021 {
1022 	struct rctl_rule *rule;
1023 
1024 	ASSERT_RACCT_ENABLED();
1025 
1026 	rule = uma_zalloc(rctl_rule_zone, flags);
1027 	if (rule == NULL)
1028 		return (NULL);
1029 	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1030 	rule->rr_subject.rs_proc = NULL;
1031 	rule->rr_subject.rs_uip = NULL;
1032 	rule->rr_subject.rs_loginclass = NULL;
1033 	rule->rr_subject.rs_prison_racct = NULL;
1034 	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1035 	rule->rr_resource = RACCT_UNDEFINED;
1036 	rule->rr_action = RCTL_ACTION_UNDEFINED;
1037 	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1038 	refcount_init(&rule->rr_refcount, 1);
1039 
1040 	return (rule);
1041 }
1042 
1043 struct rctl_rule *
1044 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1045 {
1046 	struct rctl_rule *copy;
1047 
1048 	ASSERT_RACCT_ENABLED();
1049 
1050 	copy = uma_zalloc(rctl_rule_zone, flags);
1051 	if (copy == NULL)
1052 		return (NULL);
1053 	copy->rr_subject_type = rule->rr_subject_type;
1054 	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1055 	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1056 	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1057 	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1058 	copy->rr_per = rule->rr_per;
1059 	copy->rr_resource = rule->rr_resource;
1060 	copy->rr_action = rule->rr_action;
1061 	copy->rr_amount = rule->rr_amount;
1062 	refcount_init(&copy->rr_refcount, 1);
1063 	rctl_rule_acquire_subject(copy);
1064 
1065 	return (copy);
1066 }
1067 
1068 void
1069 rctl_rule_acquire(struct rctl_rule *rule)
1070 {
1071 
1072 	ASSERT_RACCT_ENABLED();
1073 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1074 
1075 	refcount_acquire(&rule->rr_refcount);
1076 }
1077 
1078 static void
1079 rctl_rule_free(void *context, int pending)
1080 {
1081 	struct rctl_rule *rule;
1082 
1083 	rule = (struct rctl_rule *)context;
1084 
1085 	ASSERT_RACCT_ENABLED();
1086 	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1087 
1088 	/*
1089 	 * We don't need locking here; rule is guaranteed to be inaccessible.
1090 	 */
1091 
1092 	rctl_rule_release_subject(rule);
1093 	uma_zfree(rctl_rule_zone, rule);
1094 }
1095 
1096 void
1097 rctl_rule_release(struct rctl_rule *rule)
1098 {
1099 
1100 	ASSERT_RACCT_ENABLED();
1101 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1102 
1103 	if (refcount_release(&rule->rr_refcount)) {
1104 		/*
1105 		 * rctl_rule_release() is often called when iterating
1106 		 * over all the uidinfo structures in the system,
1107 		 * holding uihashtbl_lock.  Since rctl_rule_free()
1108 		 * might end up calling uifree(), this would lead
1109 		 * to lock recursion.  Use taskqueue to avoid this.
1110 		 */
1111 		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1112 		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1113 	}
1114 }
1115 
1116 static int
1117 rctl_rule_fully_specified(const struct rctl_rule *rule)
1118 {
1119 
1120 	ASSERT_RACCT_ENABLED();
1121 
1122 	switch (rule->rr_subject_type) {
1123 	case RCTL_SUBJECT_TYPE_UNDEFINED:
1124 		return (0);
1125 	case RCTL_SUBJECT_TYPE_PROCESS:
1126 		if (rule->rr_subject.rs_proc == NULL)
1127 			return (0);
1128 		break;
1129 	case RCTL_SUBJECT_TYPE_USER:
1130 		if (rule->rr_subject.rs_uip == NULL)
1131 			return (0);
1132 		break;
1133 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1134 		if (rule->rr_subject.rs_loginclass == NULL)
1135 			return (0);
1136 		break;
1137 	case RCTL_SUBJECT_TYPE_JAIL:
1138 		if (rule->rr_subject.rs_prison_racct == NULL)
1139 			return (0);
1140 		break;
1141 	default:
1142 		panic("rctl_rule_fully_specified: unknown subject type %d",
1143 		    rule->rr_subject_type);
1144 	}
1145 	if (rule->rr_resource == RACCT_UNDEFINED)
1146 		return (0);
1147 	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1148 		return (0);
1149 	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1150 		return (0);
1151 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1152 		return (0);
1153 
1154 	return (1);
1155 }
1156 
1157 static int
1158 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1159 {
1160 	struct rctl_rule *rule;
1161 	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1162 	     *amountstr, *perstr;
1163 	id_t id;
1164 	int error = 0;
1165 
1166 	ASSERT_RACCT_ENABLED();
1167 
1168 	rule = rctl_rule_alloc(M_WAITOK);
1169 
1170 	subjectstr = strsep(&rulestr, ":");
1171 	subject_idstr = strsep(&rulestr, ":");
1172 	resourcestr = strsep(&rulestr, ":");
1173 	actionstr = strsep(&rulestr, "=/");
1174 	amountstr = strsep(&rulestr, "/");
1175 	perstr = rulestr;
1176 
1177 	if (subjectstr == NULL || subjectstr[0] == '\0')
1178 		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1179 	else {
1180 		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1181 		if (error != 0)
1182 			goto out;
1183 	}
1184 
1185 	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1186 		rule->rr_subject.rs_proc = NULL;
1187 		rule->rr_subject.rs_uip = NULL;
1188 		rule->rr_subject.rs_loginclass = NULL;
1189 		rule->rr_subject.rs_prison_racct = NULL;
1190 	} else {
1191 		switch (rule->rr_subject_type) {
1192 		case RCTL_SUBJECT_TYPE_UNDEFINED:
1193 			error = EINVAL;
1194 			goto out;
1195 		case RCTL_SUBJECT_TYPE_PROCESS:
1196 			error = str2id(subject_idstr, &id);
1197 			if (error != 0)
1198 				goto out;
1199 			sx_assert(&allproc_lock, SA_LOCKED);
1200 			rule->rr_subject.rs_proc = pfind(id);
1201 			if (rule->rr_subject.rs_proc == NULL) {
1202 				error = ESRCH;
1203 				goto out;
1204 			}
1205 			PROC_UNLOCK(rule->rr_subject.rs_proc);
1206 			break;
1207 		case RCTL_SUBJECT_TYPE_USER:
1208 			error = str2id(subject_idstr, &id);
1209 			if (error != 0)
1210 				goto out;
1211 			rule->rr_subject.rs_uip = uifind(id);
1212 			break;
1213 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1214 			rule->rr_subject.rs_loginclass =
1215 			    loginclass_find(subject_idstr);
1216 			if (rule->rr_subject.rs_loginclass == NULL) {
1217 				error = ENAMETOOLONG;
1218 				goto out;
1219 			}
1220 			break;
1221 		case RCTL_SUBJECT_TYPE_JAIL:
1222 			rule->rr_subject.rs_prison_racct =
1223 			    prison_racct_find(subject_idstr);
1224 			if (rule->rr_subject.rs_prison_racct == NULL) {
1225 				error = ENAMETOOLONG;
1226 				goto out;
1227 			}
1228 			break;
1229                default:
1230                        panic("rctl_string_to_rule: unknown subject type %d",
1231                            rule->rr_subject_type);
1232                }
1233 	}
1234 
1235 	if (resourcestr == NULL || resourcestr[0] == '\0')
1236 		rule->rr_resource = RACCT_UNDEFINED;
1237 	else {
1238 		error = str2value(resourcestr, &rule->rr_resource,
1239 		    resourcenames);
1240 		if (error != 0)
1241 			goto out;
1242 	}
1243 
1244 	if (actionstr == NULL || actionstr[0] == '\0')
1245 		rule->rr_action = RCTL_ACTION_UNDEFINED;
1246 	else {
1247 		error = str2value(actionstr, &rule->rr_action, actionnames);
1248 		if (error != 0)
1249 			goto out;
1250 	}
1251 
1252 	if (amountstr == NULL || amountstr[0] == '\0')
1253 		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1254 	else {
1255 		error = str2int64(amountstr, &rule->rr_amount);
1256 		if (error != 0)
1257 			goto out;
1258 		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1259 			if (rule->rr_amount > INT64_MAX / 1000000) {
1260 				error = ERANGE;
1261 				goto out;
1262 			}
1263 			rule->rr_amount *= 1000000;
1264 		}
1265 	}
1266 
1267 	if (perstr == NULL || perstr[0] == '\0')
1268 		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1269 	else {
1270 		error = str2value(perstr, &rule->rr_per, subjectnames);
1271 		if (error != 0)
1272 			goto out;
1273 	}
1274 
1275 out:
1276 	if (error == 0)
1277 		*rulep = rule;
1278 	else
1279 		rctl_rule_release(rule);
1280 
1281 	return (error);
1282 }
1283 
1284 /*
1285  * Link a rule with all the subjects it applies to.
1286  */
1287 int
1288 rctl_rule_add(struct rctl_rule *rule)
1289 {
1290 	struct proc *p;
1291 	struct ucred *cred;
1292 	struct uidinfo *uip;
1293 	struct prison *pr;
1294 	struct prison_racct *prr;
1295 	struct loginclass *lc;
1296 	struct rctl_rule *rule2;
1297 	int match;
1298 
1299 	ASSERT_RACCT_ENABLED();
1300 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1301 
1302 	/*
1303 	 * Some rules just don't make sense, like "deny" rule for an undeniable
1304 	 * resource.  The exception are the RSS and %CPU resources - they are
1305 	 * not deniable in the racct sense, but the limit is enforced in
1306 	 * a different way.
1307 	 */
1308 	if (rule->rr_action == RCTL_ACTION_DENY &&
1309 	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
1310 	    rule->rr_resource != RACCT_RSS &&
1311 	    rule->rr_resource != RACCT_PCTCPU) {
1312 		return (EOPNOTSUPP);
1313 	}
1314 
1315 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1316 	    !RACCT_IS_DECAYING(rule->rr_resource)) {
1317 		return (EOPNOTSUPP);
1318 	}
1319 
1320 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1321 	    rule->rr_resource == RACCT_PCTCPU) {
1322 		return (EOPNOTSUPP);
1323 	}
1324 
1325 	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1326 	    RACCT_IS_SLOPPY(rule->rr_resource)) {
1327 		return (EOPNOTSUPP);
1328 	}
1329 
1330 	/*
1331 	 * Make sure there are no duplicated rules.  Also, for the "deny"
1332 	 * rules, remove ones differing only by "amount".
1333 	 */
1334 	if (rule->rr_action == RCTL_ACTION_DENY) {
1335 		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1336 		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1337 		rctl_rule_remove(rule2);
1338 		rctl_rule_release(rule2);
1339 	} else
1340 		rctl_rule_remove(rule);
1341 
1342 	switch (rule->rr_subject_type) {
1343 	case RCTL_SUBJECT_TYPE_PROCESS:
1344 		p = rule->rr_subject.rs_proc;
1345 		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1346 
1347 		rctl_racct_add_rule(p->p_racct, rule);
1348 		/*
1349 		 * In case of per-process rule, we don't have anything more
1350 		 * to do.
1351 		 */
1352 		return (0);
1353 
1354 	case RCTL_SUBJECT_TYPE_USER:
1355 		uip = rule->rr_subject.rs_uip;
1356 		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1357 		rctl_racct_add_rule(uip->ui_racct, rule);
1358 		break;
1359 
1360 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1361 		lc = rule->rr_subject.rs_loginclass;
1362 		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1363 		rctl_racct_add_rule(lc->lc_racct, rule);
1364 		break;
1365 
1366 	case RCTL_SUBJECT_TYPE_JAIL:
1367 		prr = rule->rr_subject.rs_prison_racct;
1368 		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1369 		rctl_racct_add_rule(prr->prr_racct, rule);
1370 		break;
1371 
1372 	default:
1373 		panic("rctl_rule_add: unknown subject type %d",
1374 		    rule->rr_subject_type);
1375 	}
1376 
1377 	/*
1378 	 * Now go through all the processes and add the new rule to the ones
1379 	 * it applies to.
1380 	 */
1381 	sx_assert(&allproc_lock, SA_LOCKED);
1382 	FOREACH_PROC_IN_SYSTEM(p) {
1383 		cred = p->p_ucred;
1384 		switch (rule->rr_subject_type) {
1385 		case RCTL_SUBJECT_TYPE_USER:
1386 			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1387 			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1388 				break;
1389 			continue;
1390 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1391 			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1392 				break;
1393 			continue;
1394 		case RCTL_SUBJECT_TYPE_JAIL:
1395 			match = 0;
1396 			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1397 				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1398 					match = 1;
1399 					break;
1400 				}
1401 			}
1402 			if (match)
1403 				break;
1404 			continue;
1405 		default:
1406 			panic("rctl_rule_add: unknown subject type %d",
1407 			    rule->rr_subject_type);
1408 		}
1409 
1410 		rctl_racct_add_rule(p->p_racct, rule);
1411 	}
1412 
1413 	return (0);
1414 }
1415 
/*
 * "Pre" hook handed to the *_racct_foreach() iterators: takes the RCTL
 * write lock.
 */
static void
rctl_rule_pre_callback(void)
{

	RCTL_WLOCK();
}
1422 
/*
 * "Post" hook handed to the *_racct_foreach() iterators: drops the RCTL
 * write lock taken by rctl_rule_pre_callback().
 */
static void
rctl_rule_post_callback(void)
{

	RCTL_WUNLOCK();
}
1429 
/*
 * Per-racct callback used by rctl_rule_remove().
 *
 * 'arg2' is the filter rule and 'arg3' points to an int counter that is
 * increased by the number of rules removed from 'racct'.  The RCTL write
 * lock must be held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *foundp = arg3;

	ASSERT_RACCT_ENABLED();
	RCTL_WLOCK_ASSERT();

	*foundp += rctl_racct_remove_rules(racct, filter);
}
1443 
1444 /*
1445  * Remove all rules that match the filter.
1446  */
1447 int
1448 rctl_rule_remove(struct rctl_rule *filter)
1449 {
1450 	struct proc *p;
1451 	int found = 0;
1452 
1453 	ASSERT_RACCT_ENABLED();
1454 
1455 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1456 	    filter->rr_subject.rs_proc != NULL) {
1457 		p = filter->rr_subject.rs_proc;
1458 		RCTL_WLOCK();
1459 		found = rctl_racct_remove_rules(p->p_racct, filter);
1460 		RCTL_WUNLOCK();
1461 		if (found)
1462 			return (0);
1463 		return (ESRCH);
1464 	}
1465 
1466 	loginclass_racct_foreach(rctl_rule_remove_callback,
1467 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1468 	    filter, (void *)&found);
1469 	ui_racct_foreach(rctl_rule_remove_callback,
1470 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1471 	    filter, (void *)&found);
1472 	prison_racct_foreach(rctl_rule_remove_callback,
1473 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1474 	    filter, (void *)&found);
1475 
1476 	sx_assert(&allproc_lock, SA_LOCKED);
1477 	RCTL_WLOCK();
1478 	FOREACH_PROC_IN_SYSTEM(p) {
1479 		found += rctl_racct_remove_rules(p->p_racct, filter);
1480 	}
1481 	RCTL_WUNLOCK();
1482 
1483 	if (found)
1484 		return (0);
1485 	return (ESRCH);
1486 }
1487 
1488 /*
1489  * Appends a rule to the sbuf.
1490  */
1491 static void
1492 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1493 {
1494 	int64_t amount;
1495 
1496 	ASSERT_RACCT_ENABLED();
1497 
1498 	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1499 
1500 	switch (rule->rr_subject_type) {
1501 	case RCTL_SUBJECT_TYPE_PROCESS:
1502 		if (rule->rr_subject.rs_proc == NULL)
1503 			sbuf_printf(sb, ":");
1504 		else
1505 			sbuf_printf(sb, "%d:",
1506 			    rule->rr_subject.rs_proc->p_pid);
1507 		break;
1508 	case RCTL_SUBJECT_TYPE_USER:
1509 		if (rule->rr_subject.rs_uip == NULL)
1510 			sbuf_printf(sb, ":");
1511 		else
1512 			sbuf_printf(sb, "%d:",
1513 			    rule->rr_subject.rs_uip->ui_uid);
1514 		break;
1515 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1516 		if (rule->rr_subject.rs_loginclass == NULL)
1517 			sbuf_printf(sb, ":");
1518 		else
1519 			sbuf_printf(sb, "%s:",
1520 			    rule->rr_subject.rs_loginclass->lc_name);
1521 		break;
1522 	case RCTL_SUBJECT_TYPE_JAIL:
1523 		if (rule->rr_subject.rs_prison_racct == NULL)
1524 			sbuf_printf(sb, ":");
1525 		else
1526 			sbuf_printf(sb, "%s:",
1527 			    rule->rr_subject.rs_prison_racct->prr_name);
1528 		break;
1529 	default:
1530 		panic("rctl_rule_to_sbuf: unknown subject type %d",
1531 		    rule->rr_subject_type);
1532 	}
1533 
1534 	amount = rule->rr_amount;
1535 	if (amount != RCTL_AMOUNT_UNDEFINED &&
1536 	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1537 		amount /= 1000000;
1538 
1539 	sbuf_printf(sb, "%s:%s=%jd",
1540 	    rctl_resource_name(rule->rr_resource),
1541 	    rctl_action_name(rule->rr_action),
1542 	    amount);
1543 
1544 	if (rule->rr_per != rule->rr_subject_type)
1545 		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1546 }
1547 
1548 /*
1549  * Routine used by RCTL syscalls to read in input string.
1550  */
1551 static int
1552 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1553 {
1554 	char *str;
1555 	int error;
1556 
1557 	ASSERT_RACCT_ENABLED();
1558 
1559 	if (inbuflen <= 0)
1560 		return (EINVAL);
1561 	if (inbuflen > RCTL_MAX_INBUFSIZE)
1562 		return (E2BIG);
1563 
1564 	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1565 	error = copyinstr(inbufp, str, inbuflen, NULL);
1566 	if (error != 0) {
1567 		free(str, M_RCTL);
1568 		return (error);
1569 	}
1570 
1571 	*inputstr = str;
1572 
1573 	return (0);
1574 }
1575 
1576 /*
1577  * Routine used by RCTL syscalls to write out output string.
1578  */
1579 static int
1580 rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1581 {
1582 	int error;
1583 
1584 	ASSERT_RACCT_ENABLED();
1585 
1586 	if (outputsbuf == NULL)
1587 		return (0);
1588 
1589 	sbuf_finish(outputsbuf);
1590 	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1591 		sbuf_delete(outputsbuf);
1592 		return (ERANGE);
1593 	}
1594 	error = copyout(sbuf_data(outputsbuf), outbufp,
1595 	    sbuf_len(outputsbuf) + 1);
1596 	sbuf_delete(outputsbuf);
1597 	return (error);
1598 }
1599 
1600 static struct sbuf *
1601 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1602 {
1603 	struct sbuf *sb;
1604 	int64_t amount;
1605 	int i;
1606 
1607 	ASSERT_RACCT_ENABLED();
1608 
1609 	sb = sbuf_new_auto();
1610 	for (i = 0; i <= RACCT_MAX; i++) {
1611 		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1612 			continue;
1613 		amount = racct->r_resources[i];
1614 		if (RACCT_IS_IN_MILLIONS(i))
1615 			amount /= 1000000;
1616 		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1617 	}
1618 	sbuf_setpos(sb, sbuf_len(sb) - 1);
1619 	return (sb);
1620 }
1621 
1622 int
1623 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1624 {
1625 	struct rctl_rule *filter;
1626 	struct sbuf *outputsbuf = NULL;
1627 	struct proc *p;
1628 	struct uidinfo *uip;
1629 	struct loginclass *lc;
1630 	struct prison_racct *prr;
1631 	char *inputstr;
1632 	int error;
1633 
1634 	if (!racct_enable)
1635 		return (ENOSYS);
1636 
1637 	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1638 	if (error != 0)
1639 		return (error);
1640 
1641 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1642 	if (error != 0)
1643 		return (error);
1644 
1645 	sx_slock(&allproc_lock);
1646 	error = rctl_string_to_rule(inputstr, &filter);
1647 	free(inputstr, M_RCTL);
1648 	if (error != 0) {
1649 		sx_sunlock(&allproc_lock);
1650 		return (error);
1651 	}
1652 
1653 	switch (filter->rr_subject_type) {
1654 	case RCTL_SUBJECT_TYPE_PROCESS:
1655 		p = filter->rr_subject.rs_proc;
1656 		if (p == NULL) {
1657 			error = EINVAL;
1658 			goto out;
1659 		}
1660 		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1661 		break;
1662 	case RCTL_SUBJECT_TYPE_USER:
1663 		uip = filter->rr_subject.rs_uip;
1664 		if (uip == NULL) {
1665 			error = EINVAL;
1666 			goto out;
1667 		}
1668 		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1669 		break;
1670 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1671 		lc = filter->rr_subject.rs_loginclass;
1672 		if (lc == NULL) {
1673 			error = EINVAL;
1674 			goto out;
1675 		}
1676 		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1677 		break;
1678 	case RCTL_SUBJECT_TYPE_JAIL:
1679 		prr = filter->rr_subject.rs_prison_racct;
1680 		if (prr == NULL) {
1681 			error = EINVAL;
1682 			goto out;
1683 		}
1684 		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1685 		break;
1686 	default:
1687 		error = EINVAL;
1688 	}
1689 out:
1690 	rctl_rule_release(filter);
1691 	sx_sunlock(&allproc_lock);
1692 	if (error != 0)
1693 		return (error);
1694 
1695 	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1696 
1697 	return (error);
1698 }
1699 
1700 static void
1701 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1702 {
1703 	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1704 	struct rctl_rule_link *link;
1705 	struct sbuf *sb = (struct sbuf *)arg3;
1706 
1707 	ASSERT_RACCT_ENABLED();
1708 	RCTL_LOCK_ASSERT();
1709 
1710 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1711 		if (!rctl_rule_matches(link->rrl_rule, filter))
1712 			continue;
1713 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1714 		sbuf_printf(sb, ",");
1715 	}
1716 }
1717 
1718 int
1719 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1720 {
1721 	struct sbuf *sb;
1722 	struct rctl_rule *filter;
1723 	struct rctl_rule_link *link;
1724 	struct proc *p;
1725 	char *inputstr, *buf;
1726 	size_t bufsize;
1727 	int error;
1728 
1729 	if (!racct_enable)
1730 		return (ENOSYS);
1731 
1732 	error = priv_check(td, PRIV_RCTL_GET_RULES);
1733 	if (error != 0)
1734 		return (error);
1735 
1736 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1737 	if (error != 0)
1738 		return (error);
1739 
1740 	sx_slock(&allproc_lock);
1741 	error = rctl_string_to_rule(inputstr, &filter);
1742 	free(inputstr, M_RCTL);
1743 	if (error != 0) {
1744 		sx_sunlock(&allproc_lock);
1745 		return (error);
1746 	}
1747 
1748 	bufsize = uap->outbuflen;
1749 	if (bufsize > rctl_maxbufsize) {
1750 		sx_sunlock(&allproc_lock);
1751 		return (E2BIG);
1752 	}
1753 
1754 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1755 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1756 	KASSERT(sb != NULL, ("sbuf_new failed"));
1757 
1758 	FOREACH_PROC_IN_SYSTEM(p) {
1759 		RCTL_RLOCK();
1760 		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1761 			/*
1762 			 * Non-process rules will be added to the buffer later.
1763 			 * Adding them here would result in duplicated output.
1764 			 */
1765 			if (link->rrl_rule->rr_subject_type !=
1766 			    RCTL_SUBJECT_TYPE_PROCESS)
1767 				continue;
1768 			if (!rctl_rule_matches(link->rrl_rule, filter))
1769 				continue;
1770 			rctl_rule_to_sbuf(sb, link->rrl_rule);
1771 			sbuf_printf(sb, ",");
1772 		}
1773 		RCTL_RUNLOCK();
1774 	}
1775 
1776 	loginclass_racct_foreach(rctl_get_rules_callback,
1777 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1778 	    filter, sb);
1779 	ui_racct_foreach(rctl_get_rules_callback,
1780 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1781 	    filter, sb);
1782 	prison_racct_foreach(rctl_get_rules_callback,
1783 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1784 	    filter, sb);
1785 	if (sbuf_error(sb) == ENOMEM) {
1786 		error = ERANGE;
1787 		goto out;
1788 	}
1789 
1790 	/*
1791 	 * Remove trailing ",".
1792 	 */
1793 	if (sbuf_len(sb) > 0)
1794 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1795 
1796 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1797 out:
1798 	rctl_rule_release(filter);
1799 	sx_sunlock(&allproc_lock);
1800 	free(buf, M_RCTL);
1801 	return (error);
1802 }
1803 
1804 int
1805 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1806 {
1807 	struct sbuf *sb;
1808 	struct rctl_rule *filter;
1809 	struct rctl_rule_link *link;
1810 	char *inputstr, *buf;
1811 	size_t bufsize;
1812 	int error;
1813 
1814 	if (!racct_enable)
1815 		return (ENOSYS);
1816 
1817 	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1818 	if (error != 0)
1819 		return (error);
1820 
1821 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1822 	if (error != 0)
1823 		return (error);
1824 
1825 	sx_slock(&allproc_lock);
1826 	error = rctl_string_to_rule(inputstr, &filter);
1827 	free(inputstr, M_RCTL);
1828 	if (error != 0) {
1829 		sx_sunlock(&allproc_lock);
1830 		return (error);
1831 	}
1832 
1833 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1834 		rctl_rule_release(filter);
1835 		sx_sunlock(&allproc_lock);
1836 		return (EINVAL);
1837 	}
1838 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1839 		rctl_rule_release(filter);
1840 		sx_sunlock(&allproc_lock);
1841 		return (EOPNOTSUPP);
1842 	}
1843 	if (filter->rr_subject.rs_proc == NULL) {
1844 		rctl_rule_release(filter);
1845 		sx_sunlock(&allproc_lock);
1846 		return (EINVAL);
1847 	}
1848 
1849 	bufsize = uap->outbuflen;
1850 	if (bufsize > rctl_maxbufsize) {
1851 		rctl_rule_release(filter);
1852 		sx_sunlock(&allproc_lock);
1853 		return (E2BIG);
1854 	}
1855 
1856 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1857 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1858 	KASSERT(sb != NULL, ("sbuf_new failed"));
1859 
1860 	RCTL_RLOCK();
1861 	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1862 	    rrl_next) {
1863 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1864 		sbuf_printf(sb, ",");
1865 	}
1866 	RCTL_RUNLOCK();
1867 	if (sbuf_error(sb) == ENOMEM) {
1868 		error = ERANGE;
1869 		sbuf_delete(sb);
1870 		goto out;
1871 	}
1872 
1873 	/*
1874 	 * Remove trailing ",".
1875 	 */
1876 	if (sbuf_len(sb) > 0)
1877 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1878 
1879 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1880 out:
1881 	rctl_rule_release(filter);
1882 	sx_sunlock(&allproc_lock);
1883 	free(buf, M_RCTL);
1884 	return (error);
1885 }
1886 
1887 int
1888 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1889 {
1890 	struct rctl_rule *rule;
1891 	char *inputstr;
1892 	int error;
1893 
1894 	if (!racct_enable)
1895 		return (ENOSYS);
1896 
1897 	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1898 	if (error != 0)
1899 		return (error);
1900 
1901 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1902 	if (error != 0)
1903 		return (error);
1904 
1905 	sx_slock(&allproc_lock);
1906 	error = rctl_string_to_rule(inputstr, &rule);
1907 	free(inputstr, M_RCTL);
1908 	if (error != 0) {
1909 		sx_sunlock(&allproc_lock);
1910 		return (error);
1911 	}
1912 	/*
1913 	 * The 'per' part of a rule is optional.
1914 	 */
1915 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1916 	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1917 		rule->rr_per = rule->rr_subject_type;
1918 
1919 	if (!rctl_rule_fully_specified(rule)) {
1920 		error = EINVAL;
1921 		goto out;
1922 	}
1923 
1924 	error = rctl_rule_add(rule);
1925 
1926 out:
1927 	rctl_rule_release(rule);
1928 	sx_sunlock(&allproc_lock);
1929 	return (error);
1930 }
1931 
1932 int
1933 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1934 {
1935 	struct rctl_rule *filter;
1936 	char *inputstr;
1937 	int error;
1938 
1939 	if (!racct_enable)
1940 		return (ENOSYS);
1941 
1942 	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1943 	if (error != 0)
1944 		return (error);
1945 
1946 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1947 	if (error != 0)
1948 		return (error);
1949 
1950 	sx_slock(&allproc_lock);
1951 	error = rctl_string_to_rule(inputstr, &filter);
1952 	free(inputstr, M_RCTL);
1953 	if (error != 0) {
1954 		sx_sunlock(&allproc_lock);
1955 		return (error);
1956 	}
1957 
1958 	error = rctl_rule_remove(filter);
1959 	rctl_rule_release(filter);
1960 	sx_sunlock(&allproc_lock);
1961 
1962 	return (error);
1963 }
1964 
1965 /*
1966  * Update RCTL rule list after credential change.
1967  */
1968 void
1969 rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1970 {
1971 	LIST_HEAD(, rctl_rule_link) newrules;
1972 	struct rctl_rule_link *link, *newlink;
1973 	struct uidinfo *newuip;
1974 	struct loginclass *newlc;
1975 	struct prison_racct *newprr;
1976 	int rulecnt, i;
1977 
1978 	ASSERT_RACCT_ENABLED();
1979 
1980 	newuip = newcred->cr_ruidinfo;
1981 	newlc = newcred->cr_loginclass;
1982 	newprr = newcred->cr_prison->pr_prison_racct;
1983 
1984 	LIST_INIT(&newrules);
1985 
1986 again:
1987 	/*
1988 	 * First, count the rules that apply to the process with new
1989 	 * credentials.
1990 	 */
1991 	rulecnt = 0;
1992 	RCTL_RLOCK();
1993 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1994 		if (link->rrl_rule->rr_subject_type ==
1995 		    RCTL_SUBJECT_TYPE_PROCESS)
1996 			rulecnt++;
1997 	}
1998 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1999 		rulecnt++;
2000 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
2001 		rulecnt++;
2002 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
2003 		rulecnt++;
2004 	RCTL_RUNLOCK();
2005 
2006 	/*
2007 	 * Create temporary list.  We've dropped the rctl_lock in order
2008 	 * to use M_WAITOK.
2009 	 */
2010 	for (i = 0; i < rulecnt; i++) {
2011 		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
2012 		newlink->rrl_rule = NULL;
2013 		newlink->rrl_exceeded = 0;
2014 		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
2015 	}
2016 
2017 	newlink = LIST_FIRST(&newrules);
2018 
2019 	/*
2020 	 * Assign rules to the newly allocated list entries.
2021 	 */
2022 	RCTL_WLOCK();
2023 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2024 		if (link->rrl_rule->rr_subject_type ==
2025 		    RCTL_SUBJECT_TYPE_PROCESS) {
2026 			if (newlink == NULL)
2027 				goto goaround;
2028 			rctl_rule_acquire(link->rrl_rule);
2029 			newlink->rrl_rule = link->rrl_rule;
2030 			newlink->rrl_exceeded = link->rrl_exceeded;
2031 			newlink = LIST_NEXT(newlink, rrl_next);
2032 			rulecnt--;
2033 		}
2034 	}
2035 
2036 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2037 		if (newlink == NULL)
2038 			goto goaround;
2039 		rctl_rule_acquire(link->rrl_rule);
2040 		newlink->rrl_rule = link->rrl_rule;
2041 		newlink->rrl_exceeded = link->rrl_exceeded;
2042 		newlink = LIST_NEXT(newlink, rrl_next);
2043 		rulecnt--;
2044 	}
2045 
2046 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2047 		if (newlink == NULL)
2048 			goto goaround;
2049 		rctl_rule_acquire(link->rrl_rule);
2050 		newlink->rrl_rule = link->rrl_rule;
2051 		newlink->rrl_exceeded = link->rrl_exceeded;
2052 		newlink = LIST_NEXT(newlink, rrl_next);
2053 		rulecnt--;
2054 	}
2055 
2056 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2057 		if (newlink == NULL)
2058 			goto goaround;
2059 		rctl_rule_acquire(link->rrl_rule);
2060 		newlink->rrl_rule = link->rrl_rule;
2061 		newlink->rrl_exceeded = link->rrl_exceeded;
2062 		newlink = LIST_NEXT(newlink, rrl_next);
2063 		rulecnt--;
2064 	}
2065 
2066 	if (rulecnt == 0) {
2067 		/*
2068 		 * Free the old rule list.
2069 		 */
2070 		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2071 			link = LIST_FIRST(&p->p_racct->r_rule_links);
2072 			LIST_REMOVE(link, rrl_next);
2073 			rctl_rule_release(link->rrl_rule);
2074 			uma_zfree(rctl_rule_link_zone, link);
2075 		}
2076 
2077 		/*
2078 		 * Replace lists and we're done.
2079 		 *
2080 		 * XXX: Is there any way to switch list heads instead
2081 		 *      of iterating here?
2082 		 */
2083 		while (!LIST_EMPTY(&newrules)) {
2084 			newlink = LIST_FIRST(&newrules);
2085 			LIST_REMOVE(newlink, rrl_next);
2086 			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2087 			    newlink, rrl_next);
2088 		}
2089 
2090 		RCTL_WUNLOCK();
2091 
2092 		return;
2093 	}
2094 
2095 goaround:
2096 	RCTL_WUNLOCK();
2097 
2098 	/*
2099 	 * Rule list changed while we were not holding the rctl_lock.
2100 	 * Free the new list and try again.
2101 	 */
2102 	while (!LIST_EMPTY(&newrules)) {
2103 		newlink = LIST_FIRST(&newrules);
2104 		LIST_REMOVE(newlink, rrl_next);
2105 		if (newlink->rrl_rule != NULL)
2106 			rctl_rule_release(newlink->rrl_rule);
2107 		uma_zfree(rctl_rule_link_zone, newlink);
2108 	}
2109 
2110 	goto again;
2111 }
2112 
2113 /*
2114  * Assign RCTL rules to the newly created process.
2115  */
2116 int
2117 rctl_proc_fork(struct proc *parent, struct proc *child)
2118 {
2119 	struct rctl_rule *rule;
2120 	struct rctl_rule_link *link;
2121 	int error;
2122 
2123 	LIST_INIT(&child->p_racct->r_rule_links);
2124 
2125 	ASSERT_RACCT_ENABLED();
2126 	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2127 
2128 	RCTL_WLOCK();
2129 
2130 	/*
2131 	 * Go through limits applicable to the parent and assign them
2132 	 * to the child.  Rules with 'process' subject have to be duplicated
2133 	 * in order to make their rr_subject point to the new process.
2134 	 */
2135 	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2136 		if (link->rrl_rule->rr_subject_type ==
2137 		    RCTL_SUBJECT_TYPE_PROCESS) {
2138 			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2139 			if (rule == NULL)
2140 				goto fail;
2141 			KASSERT(rule->rr_subject.rs_proc == parent,
2142 			    ("rule->rr_subject.rs_proc != parent"));
2143 			rule->rr_subject.rs_proc = child;
2144 			error = rctl_racct_add_rule_locked(child->p_racct,
2145 			    rule);
2146 			rctl_rule_release(rule);
2147 			if (error != 0)
2148 				goto fail;
2149 		} else {
2150 			error = rctl_racct_add_rule_locked(child->p_racct,
2151 			    link->rrl_rule);
2152 			if (error != 0)
2153 				goto fail;
2154 		}
2155 	}
2156 
2157 	RCTL_WUNLOCK();
2158 	return (0);
2159 
2160 fail:
2161 	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2162 		link = LIST_FIRST(&child->p_racct->r_rule_links);
2163 		LIST_REMOVE(link, rrl_next);
2164 		rctl_rule_release(link->rrl_rule);
2165 		uma_zfree(rctl_rule_link_zone, link);
2166 	}
2167 	RCTL_WUNLOCK();
2168 	return (EAGAIN);
2169 }
2170 
2171 /*
2172  * Release rules attached to the racct.
2173  */
2174 void
2175 rctl_racct_release(struct racct *racct)
2176 {
2177 	struct rctl_rule_link *link;
2178 
2179 	ASSERT_RACCT_ENABLED();
2180 
2181 	RCTL_WLOCK();
2182 	while (!LIST_EMPTY(&racct->r_rule_links)) {
2183 		link = LIST_FIRST(&racct->r_rule_links);
2184 		LIST_REMOVE(link, rrl_next);
2185 		rctl_rule_release(link->rrl_rule);
2186 		uma_zfree(rctl_rule_link_zone, link);
2187 	}
2188 	RCTL_WUNLOCK();
2189 }
2190 
2191 static void
2192 rctl_init(void)
2193 {
2194 
2195 	if (!racct_enable)
2196 		return;
2197 
2198 	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2199 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2200 	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2201 	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2202 	    UMA_ALIGN_PTR, 0);
2203 
2204 	/*
2205 	 * Set default values, making sure not to overwrite the ones
2206 	 * fetched from tunables.  Most of those could be set at the
2207 	 * declaration, except for the rctl_throttle_max - we cannot
2208 	 * set it there due to hz not being compile time constant.
2209 	 */
2210 	if (rctl_throttle_min < 1)
2211 		rctl_throttle_min = 1;
2212 	if (rctl_throttle_max < rctl_throttle_min)
2213 		rctl_throttle_max = 2 * hz;
2214 	if (rctl_throttle_pct < 0)
2215 		rctl_throttle_pct = 100;
2216 	if (rctl_throttle_pct2 < 0)
2217 		rctl_throttle_pct2 = 100;
2218 }
2219 
2220 #else /* !RCTL */
2221 
/*
 * Stub used when the kernel is built without RCTL: the system call
 * exists but always fails with ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

	/* Neither argument is looked at. */
	return (ENOSYS);
}
2228 
/*
 * Stub used when the kernel is built without RCTL: the system call
 * exists but always fails with ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

	/* Neither argument is looked at. */
	return (ENOSYS);
}
2235 
/*
 * Stub used when the kernel is built without RCTL: the system call
 * exists but always fails with ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

	/* Neither argument is looked at. */
	return (ENOSYS);
}
2242 
/*
 * Stub used when the kernel is built without RCTL: the system call
 * exists but always fails with ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

	/* Neither argument is looked at. */
	return (ENOSYS);
}
2249 
/*
 * Stub used when the kernel is built without RCTL: the system call
 * exists but always fails with ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

	/* Neither argument is looked at. */
	return (ENOSYS);
}
2256 
2257 #endif /* !RCTL */
2258