xref: /freebsd/sys/kern/kern_rctl.c (revision 94086cea279d930eb2fbe7d680585abde7e9c095)
1 /*-
2  * Copyright (c) 2010 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <sys/param.h>
36 #include <sys/bus.h>
37 #include <sys/malloc.h>
38 #include <sys/queue.h>
39 #include <sys/refcount.h>
40 #include <sys/jail.h>
41 #include <sys/kernel.h>
42 #include <sys/limits.h>
43 #include <sys/loginclass.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/racct.h>
47 #include <sys/rctl.h>
48 #include <sys/resourcevar.h>
49 #include <sys/sx.h>
50 #include <sys/sysent.h>
51 #include <sys/sysproto.h>
52 #include <sys/systm.h>
53 #include <sys/types.h>
54 #include <sys/eventhandler.h>
55 #include <sys/lock.h>
56 #include <sys/mutex.h>
57 #include <sys/rwlock.h>
58 #include <sys/sbuf.h>
59 #include <sys/taskqueue.h>
60 #include <sys/tree.h>
61 #include <vm/uma.h>
62 
63 #ifdef RCTL
64 #ifndef RACCT
65 #error "The RCTL option requires the RACCT option"
66 #endif
67 
68 FEATURE(rctl, "Resource Limits");
69 
/* HRF_* flag values. */
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer size limits.  The expansions are parenthesized so that the
 * macros keep their value when they appear inside a larger expression
 * (e.g. as the right-hand operand of '/' or '%').
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/* Safety margin subtracted from the available %cpu by rctl_pcpu_available(). */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
79 
80 static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
81 static int rctl_log_rate_limit = 10;
82 static int rctl_devctl_rate_limit = 10;
83 
84 /*
85  * Values below are initialized in rctl_init().
86  */
87 static int rctl_throttle_min = -1;
88 static int rctl_throttle_max = -1;
89 static int rctl_throttle_pct = -1;
90 static int rctl_throttle_pct2 = -1;
91 
92 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
93 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
94 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
95 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);
96 
97 SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
98 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
99     &rctl_maxbufsize, 0, "Maximum output buffer size");
100 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
101     &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
102 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
103     &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
104 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
105     CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU",
106     "Shortest throttling duration, in hz");
107 TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
108 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
109     CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU",
110     "Longest throttling duration, in hz");
111 TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
112 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
113     CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU",
114     "Throttling penalty for process consumption, in percent");
115 TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
116 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
117     CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU",
118     "Throttling penalty for container consumption, in percent");
119 TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
120 
121 /*
122  * 'rctl_rule_link' connects a rule with every racct it's related to.
123  * For example, rule 'user:X:openfiles:deny=N/process' is linked
124  * with uidinfo for user X, and to each process of that user.
125  */
126 struct rctl_rule_link {
127 	LIST_ENTRY(rctl_rule_link)	rrl_next;
128 	struct rctl_rule		*rrl_rule;
129 	int				rrl_exceeded;
130 };
131 
/*
 * One entry of a name <-> numeric value lookup table.  Tables built
 * from it are terminated by an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* textual name */
	int		d_value;	/* matching numeric constant */
};
136 
137 static struct dict subjectnames[] = {
138 	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
139 	{ "user", RCTL_SUBJECT_TYPE_USER },
140 	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
141 	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
142 	{ NULL, -1 }};
143 
144 static struct dict resourcenames[] = {
145 	{ "cputime", RACCT_CPU },
146 	{ "datasize", RACCT_DATA },
147 	{ "stacksize", RACCT_STACK },
148 	{ "coredumpsize", RACCT_CORE },
149 	{ "memoryuse", RACCT_RSS },
150 	{ "memorylocked", RACCT_MEMLOCK },
151 	{ "maxproc", RACCT_NPROC },
152 	{ "openfiles", RACCT_NOFILE },
153 	{ "vmemoryuse", RACCT_VMEM },
154 	{ "pseudoterminals", RACCT_NPTS },
155 	{ "swapuse", RACCT_SWAP },
156 	{ "nthr", RACCT_NTHR },
157 	{ "msgqqueued", RACCT_MSGQQUEUED },
158 	{ "msgqsize", RACCT_MSGQSIZE },
159 	{ "nmsgq", RACCT_NMSGQ },
160 	{ "nsem", RACCT_NSEM },
161 	{ "nsemop", RACCT_NSEMOP },
162 	{ "nshm", RACCT_NSHM },
163 	{ "shmsize", RACCT_SHMSIZE },
164 	{ "wallclock", RACCT_WALLCLOCK },
165 	{ "pcpu", RACCT_PCTCPU },
166 	{ "readbps", RACCT_READBPS },
167 	{ "writebps", RACCT_WRITEBPS },
168 	{ "readiops", RACCT_READIOPS },
169 	{ "writeiops", RACCT_WRITEIOPS },
170 	{ NULL, -1 }};
171 
172 static struct dict actionnames[] = {
173 	{ "sighup", RCTL_ACTION_SIGHUP },
174 	{ "sigint", RCTL_ACTION_SIGINT },
175 	{ "sigquit", RCTL_ACTION_SIGQUIT },
176 	{ "sigill", RCTL_ACTION_SIGILL },
177 	{ "sigtrap", RCTL_ACTION_SIGTRAP },
178 	{ "sigabrt", RCTL_ACTION_SIGABRT },
179 	{ "sigemt", RCTL_ACTION_SIGEMT },
180 	{ "sigfpe", RCTL_ACTION_SIGFPE },
181 	{ "sigkill", RCTL_ACTION_SIGKILL },
182 	{ "sigbus", RCTL_ACTION_SIGBUS },
183 	{ "sigsegv", RCTL_ACTION_SIGSEGV },
184 	{ "sigsys", RCTL_ACTION_SIGSYS },
185 	{ "sigpipe", RCTL_ACTION_SIGPIPE },
186 	{ "sigalrm", RCTL_ACTION_SIGALRM },
187 	{ "sigterm", RCTL_ACTION_SIGTERM },
188 	{ "sigurg", RCTL_ACTION_SIGURG },
189 	{ "sigstop", RCTL_ACTION_SIGSTOP },
190 	{ "sigtstp", RCTL_ACTION_SIGTSTP },
191 	{ "sigchld", RCTL_ACTION_SIGCHLD },
192 	{ "sigttin", RCTL_ACTION_SIGTTIN },
193 	{ "sigttou", RCTL_ACTION_SIGTTOU },
194 	{ "sigio", RCTL_ACTION_SIGIO },
195 	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
196 	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
197 	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
198 	{ "sigprof", RCTL_ACTION_SIGPROF },
199 	{ "sigwinch", RCTL_ACTION_SIGWINCH },
200 	{ "siginfo", RCTL_ACTION_SIGINFO },
201 	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
202 	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
203 	{ "sigthr", RCTL_ACTION_SIGTHR },
204 	{ "deny", RCTL_ACTION_DENY },
205 	{ "log", RCTL_ACTION_LOG },
206 	{ "devctl", RCTL_ACTION_DEVCTL },
207 	{ "throttle", RCTL_ACTION_THROTTLE },
208 	{ NULL, -1 }};
209 
210 static void rctl_init(void);
211 SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
212 
213 static uma_zone_t rctl_rule_link_zone;
214 static uma_zone_t rctl_rule_zone;
215 static struct rwlock rctl_lock;
216 RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock");
217 
218 #define RCTL_RLOCK()		rw_rlock(&rctl_lock)
219 #define RCTL_RUNLOCK()		rw_runlock(&rctl_lock)
220 #define RCTL_WLOCK()		rw_wlock(&rctl_lock)
221 #define RCTL_WUNLOCK()		rw_wunlock(&rctl_lock)
222 #define RCTL_LOCK_ASSERT()	rw_assert(&rctl_lock, RA_LOCKED)
223 #define RCTL_WLOCK_ASSERT()	rw_assert(&rctl_lock, RA_WLOCKED)
224 
225 static int rctl_rule_fully_specified(const struct rctl_rule *rule);
226 static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
227 
228 static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
229 
230 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
231 {
232 	int val = rctl_throttle_min;
233 	int error;
234 
235 	error = sysctl_handle_int(oidp, &val, 0, req);
236 	if (error || !req->newptr)
237 		return (error);
238 	if (val < 1 || val > rctl_throttle_max)
239 		return (EINVAL);
240 
241 	RCTL_WLOCK();
242 	rctl_throttle_min = val;
243 	RCTL_WUNLOCK();
244 
245 	return (0);
246 }
247 
248 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
249 {
250 	int val = rctl_throttle_max;
251 	int error;
252 
253 	error = sysctl_handle_int(oidp, &val, 0, req);
254 	if (error || !req->newptr)
255 		return (error);
256 	if (val < rctl_throttle_min)
257 		return (EINVAL);
258 
259 	RCTL_WLOCK();
260 	rctl_throttle_max = val;
261 	RCTL_WUNLOCK();
262 
263 	return (0);
264 }
265 
266 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
267 {
268 	int val = rctl_throttle_pct;
269 	int error;
270 
271 	error = sysctl_handle_int(oidp, &val, 0, req);
272 	if (error || !req->newptr)
273 		return (error);
274 	if (val < 0)
275 		return (EINVAL);
276 
277 	RCTL_WLOCK();
278 	rctl_throttle_pct = val;
279 	RCTL_WUNLOCK();
280 
281 	return (0);
282 }
283 
284 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
285 {
286 	int val = rctl_throttle_pct2;
287 	int error;
288 
289 	error = sysctl_handle_int(oidp, &val, 0, req);
290 	if (error || !req->newptr)
291 		return (error);
292 	if (val < 0)
293 		return (EINVAL);
294 
295 	RCTL_WLOCK();
296 	rctl_throttle_pct2 = val;
297 	RCTL_WUNLOCK();
298 
299 	return (0);
300 }
301 
302 static const char *
303 rctl_subject_type_name(int subject)
304 {
305 	int i;
306 
307 	for (i = 0; subjectnames[i].d_name != NULL; i++) {
308 		if (subjectnames[i].d_value == subject)
309 			return (subjectnames[i].d_name);
310 	}
311 
312 	panic("rctl_subject_type_name: unknown subject type %d", subject);
313 }
314 
315 static const char *
316 rctl_action_name(int action)
317 {
318 	int i;
319 
320 	for (i = 0; actionnames[i].d_name != NULL; i++) {
321 		if (actionnames[i].d_value == action)
322 			return (actionnames[i].d_name);
323 	}
324 
325 	panic("rctl_action_name: unknown action %d", action);
326 }
327 
328 const char *
329 rctl_resource_name(int resource)
330 {
331 	int i;
332 
333 	for (i = 0; resourcenames[i].d_name != NULL; i++) {
334 		if (resourcenames[i].d_value == resource)
335 			return (resourcenames[i].d_name);
336 	}
337 
338 	panic("rctl_resource_name: unknown resource %d", resource);
339 }
340 
341 static struct racct *
342 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
343 {
344 	struct ucred *cred = p->p_ucred;
345 
346 	ASSERT_RACCT_ENABLED();
347 	RCTL_LOCK_ASSERT();
348 
349 	switch (rule->rr_per) {
350 	case RCTL_SUBJECT_TYPE_PROCESS:
351 		return (p->p_racct);
352 	case RCTL_SUBJECT_TYPE_USER:
353 		return (cred->cr_ruidinfo->ui_racct);
354 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
355 		return (cred->cr_loginclass->lc_racct);
356 	case RCTL_SUBJECT_TYPE_JAIL:
357 		return (cred->cr_prison->pr_prison_racct->prr_racct);
358 	default:
359 		panic("%s: unknown per %d", __func__, rule->rr_per);
360 	}
361 }
362 
363 /*
364  * Return the amount of resource that can be allocated by 'p' before
365  * hitting 'rule'.
366  */
367 static int64_t
368 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
369 {
370 	int64_t available;
371 	const struct racct *racct;
372 
373 	ASSERT_RACCT_ENABLED();
374 	RCTL_LOCK_ASSERT();
375 
376 	racct = rctl_proc_rule_to_racct(p, rule);
377 	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
378 
379 	return (available);
380 }
381 
382 /*
383  * Called every second for proc, uidinfo, loginclass, and jail containers.
384  * If the limit isn't exceeded, it decreases the usage amount to zero.
385  * Otherwise, it decreases it by the value of the limit.  This way
386  * resource consumption exceeding the limit "carries over" to the next
387  * period.
388  */
389 void
390 rctl_throttle_decay(struct racct *racct, int resource)
391 {
392 	struct rctl_rule *rule;
393 	struct rctl_rule_link *link;
394 	int64_t minavailable;
395 
396 	ASSERT_RACCT_ENABLED();
397 
398 	minavailable = INT64_MAX;
399 
400 	RCTL_RLOCK();
401 
402 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
403 		rule = link->rrl_rule;
404 
405 		if (rule->rr_resource != resource)
406 			continue;
407 		if (rule->rr_action != RCTL_ACTION_THROTTLE)
408 			continue;
409 
410 		if (rule->rr_amount < minavailable)
411 			minavailable = rule->rr_amount;
412 	}
413 
414 	RCTL_RUNLOCK();
415 
416 	if (racct->r_resources[resource] < minavailable) {
417 		racct->r_resources[resource] = 0;
418 	} else {
419 		/*
420 		 * Cap utilization counter at ten times the limit.  Otherwise,
421 		 * if we changed the rule lowering the allowed amount, it could
422 		 * take unreasonably long time for the accumulated resource
423 		 * usage to drop.
424 		 */
425 		if (racct->r_resources[resource] > minavailable * 10)
426 			racct->r_resources[resource] = minavailable * 10;
427 
428 		racct->r_resources[resource] -= minavailable;
429 	}
430 }
431 
432 /*
433  * Special version of rctl_get_available() for the %CPU resource.
434  * We slightly cheat here and return less than we normally would.
435  */
436 int64_t
437 rctl_pcpu_available(const struct proc *p) {
438 	struct rctl_rule *rule;
439 	struct rctl_rule_link *link;
440 	int64_t available, minavailable, limit;
441 
442 	ASSERT_RACCT_ENABLED();
443 
444 	minavailable = INT64_MAX;
445 	limit = 0;
446 
447 	RCTL_RLOCK();
448 
449 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
450 		rule = link->rrl_rule;
451 		if (rule->rr_resource != RACCT_PCTCPU)
452 			continue;
453 		if (rule->rr_action != RCTL_ACTION_DENY)
454 			continue;
455 		available = rctl_available_resource(p, rule);
456 		if (available < minavailable) {
457 			minavailable = available;
458 			limit = rule->rr_amount;
459 		}
460 	}
461 
462 	RCTL_RUNLOCK();
463 
464 	/*
465 	 * Return slightly less than actual value of the available
466 	 * %cpu resource.  This makes %cpu throttling more agressive
467 	 * and lets us act sooner than the limits are already exceeded.
468 	 */
469 	if (limit != 0) {
470 		if (limit > 2 * RCTL_PCPU_SHIFT)
471 			minavailable -= RCTL_PCPU_SHIFT;
472 		else
473 			minavailable -= (limit / 2);
474 	}
475 
476 	return (minavailable);
477 }
478 
/*
 * Saturating unsigned addition: returns a + b, or UINT64_MAX when the
 * sum does not fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

	/* The sum overflows exactly when b exceeds the room left above a. */
	if (b > UINT64_MAX - a)
		return (UINT64_MAX);

	return (a + b);
}
494 
/*
 * Saturating unsigned multiplication: returns a * b, or UINT64_MAX when
 * the product does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	/* Nothing can overflow, and the division below needs b != 0. */
	if (b == 0)
		return (0);

	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
504 
505 /*
506  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
507  * to what it keeps allocated now.  Returns non-zero if the allocation should
508  * be denied, 0 otherwise.
509  */
510 int
511 rctl_enforce(struct proc *p, int resource, uint64_t amount)
512 {
513 	static struct timeval log_lasttime, devctl_lasttime;
514 	static int log_curtime = 0, devctl_curtime = 0;
515 	struct rctl_rule *rule;
516 	struct rctl_rule_link *link;
517 	struct sbuf sb;
518 	int64_t available;
519 	uint64_t sleep_ms, sleep_ratio;
520 	int should_deny = 0;
521 	char *buf;
522 
523 
524 	ASSERT_RACCT_ENABLED();
525 
526 	RCTL_RLOCK();
527 
528 	/*
529 	 * There may be more than one matching rule; go through all of them.
530 	 * Denial should be done last, after logging and sending signals.
531 	 */
532 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
533 		rule = link->rrl_rule;
534 		if (rule->rr_resource != resource)
535 			continue;
536 
537 		available = rctl_available_resource(p, rule);
538 		if (available >= (int64_t)amount) {
539 			link->rrl_exceeded = 0;
540 			continue;
541 		}
542 
543 		switch (rule->rr_action) {
544 		case RCTL_ACTION_DENY:
545 			should_deny = 1;
546 			continue;
547 		case RCTL_ACTION_LOG:
548 			/*
549 			 * If rrl_exceeded != 0, it means we've already
550 			 * logged a warning for this process.
551 			 */
552 			if (link->rrl_exceeded != 0)
553 				continue;
554 
555 			/*
556 			 * If the process state is not fully initialized yet,
557 			 * we can't access most of the required fields, e.g.
558 			 * p->p_comm.  This happens when called from fork1().
559 			 * Ignore this rule for now; it will be processed just
560 			 * after fork, when called from racct_proc_fork_done().
561 			 */
562 			if (p->p_state != PRS_NORMAL)
563 				continue;
564 
565 			if (!ppsratecheck(&log_lasttime, &log_curtime,
566 			    rctl_log_rate_limit))
567 				continue;
568 
569 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
570 			if (buf == NULL) {
571 				printf("rctl_enforce: out of memory\n");
572 				continue;
573 			}
574 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
575 			rctl_rule_to_sbuf(&sb, rule);
576 			sbuf_finish(&sb);
577 			printf("rctl: rule \"%s\" matched by pid %d "
578 			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
579 			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
580 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
581 			sbuf_delete(&sb);
582 			free(buf, M_RCTL);
583 			link->rrl_exceeded = 1;
584 			continue;
585 		case RCTL_ACTION_DEVCTL:
586 			if (link->rrl_exceeded != 0)
587 				continue;
588 
589 			if (p->p_state != PRS_NORMAL)
590 				continue;
591 
592 			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
593 			    rctl_devctl_rate_limit))
594 				continue;
595 
596 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
597 			if (buf == NULL) {
598 				printf("rctl_enforce: out of memory\n");
599 				continue;
600 			}
601 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
602 			sbuf_printf(&sb, "rule=");
603 			rctl_rule_to_sbuf(&sb, rule);
604 			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
605 			    p->p_pid, p->p_ucred->cr_ruid,
606 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
607 			sbuf_finish(&sb);
608 			devctl_notify_f("RCTL", "rule", "matched",
609 			    sbuf_data(&sb), M_NOWAIT);
610 			sbuf_delete(&sb);
611 			free(buf, M_RCTL);
612 			link->rrl_exceeded = 1;
613 			continue;
614 		case RCTL_ACTION_THROTTLE:
615 			if (p->p_state != PRS_NORMAL)
616 				continue;
617 
618 			/*
619 			 * Make the process sleep for a fraction of second
620 			 * proportional to the ratio of process' resource
621 			 * utilization compared to the limit.  The point is
622 			 * to penalize resource hogs: processes that consume
623 			 * more of the available resources sleep for longer.
624 			 *
625 			 * We're trying to defer division until the very end,
626 			 * to minimize the rounding effects.  The following
627 			 * calculation could have been written in a clearer
628 			 * way like this:
629 			 *
630 			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
631 			 *     rule->rr_amount;
632 			 * sleep_ms *= rctl_throttle_pct / 100;
633 			 * if (sleep_ms < rctl_throttle_min)
634 			 *         sleep_ms = rctl_throttle_min;
635 			 *
636 			 */
637 			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
638 			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
639 			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
640 				sleep_ms = rctl_throttle_min * rule->rr_amount;
641 
642 			/*
643 			 * Multiply that by the ratio of the resource
644 			 * consumption for the container compared to the limit,
645 			 * squared.  In other words, a process in a container
646 			 * that is two times over the limit will be throttled
647 			 * four times as much for hitting the same rule.  The
648 			 * point is to penalize processes more if the container
649 			 * itself (eg certain UID or jail) is above the limit.
650 			 */
651 			if (available < 0)
652 				sleep_ratio = -available / rule->rr_amount;
653 			else
654 				sleep_ratio = 0;
655 			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
656 			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
657 			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
658 
659 			/*
660 			 * Finally the division.
661 			 */
662 			sleep_ms /= rule->rr_amount;
663 
664 			if (sleep_ms > rctl_throttle_max)
665 				sleep_ms = rctl_throttle_max;
666 #if 0
667 			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n",
668 			   __func__, p->p_pid, p->p_comm,
669 			   p->p_racct->r_resources[resource],
670 			   rule->rr_amount, sleep_ms, sleep_ratio, available);
671 #endif
672 
673 			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
674 			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
675 			racct_proc_throttle(p, sleep_ms);
676 			continue;
677 		default:
678 			if (link->rrl_exceeded != 0)
679 				continue;
680 
681 			if (p->p_state != PRS_NORMAL)
682 				continue;
683 
684 			KASSERT(rule->rr_action > 0 &&
685 			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
686 			    ("rctl_enforce: unknown action %d",
687 			     rule->rr_action));
688 
689 			/*
690 			 * We're using the fact that RCTL_ACTION_SIG* values
691 			 * are equal to their counterparts from sys/signal.h.
692 			 */
693 			kern_psignal(p, rule->rr_action);
694 			link->rrl_exceeded = 1;
695 			continue;
696 		}
697 	}
698 
699 	RCTL_RUNLOCK();
700 
701 	if (should_deny) {
702 		/*
703 		 * Return fake error code; the caller should change it
704 		 * into one proper for the situation - EFSIZ, ENOMEM etc.
705 		 */
706 		return (EDOOFUS);
707 	}
708 
709 	return (0);
710 }
711 
712 uint64_t
713 rctl_get_limit(struct proc *p, int resource)
714 {
715 	struct rctl_rule *rule;
716 	struct rctl_rule_link *link;
717 	uint64_t amount = UINT64_MAX;
718 
719 	ASSERT_RACCT_ENABLED();
720 
721 	RCTL_RLOCK();
722 
723 	/*
724 	 * There may be more than one matching rule; go through all of them.
725 	 * Denial should be done last, after logging and sending signals.
726 	 */
727 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
728 		rule = link->rrl_rule;
729 		if (rule->rr_resource != resource)
730 			continue;
731 		if (rule->rr_action != RCTL_ACTION_DENY)
732 			continue;
733 		if (rule->rr_amount < amount)
734 			amount = rule->rr_amount;
735 	}
736 
737 	RCTL_RUNLOCK();
738 
739 	return (amount);
740 }
741 
742 uint64_t
743 rctl_get_available(struct proc *p, int resource)
744 {
745 	struct rctl_rule *rule;
746 	struct rctl_rule_link *link;
747 	int64_t available, minavailable, allocated;
748 
749 	minavailable = INT64_MAX;
750 
751 	ASSERT_RACCT_ENABLED();
752 
753 	RCTL_RLOCK();
754 
755 	/*
756 	 * There may be more than one matching rule; go through all of them.
757 	 * Denial should be done last, after logging and sending signals.
758 	 */
759 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
760 		rule = link->rrl_rule;
761 		if (rule->rr_resource != resource)
762 			continue;
763 		if (rule->rr_action != RCTL_ACTION_DENY)
764 			continue;
765 		available = rctl_available_resource(p, rule);
766 		if (available < minavailable)
767 			minavailable = available;
768 	}
769 
770 	RCTL_RUNLOCK();
771 
772 	/*
773 	 * XXX: Think about this _hard_.
774 	 */
775 	allocated = p->p_racct->r_resources[resource];
776 	if (minavailable < INT64_MAX - allocated)
777 		minavailable += allocated;
778 	if (minavailable < 0)
779 		minavailable = 0;
780 	return (minavailable);
781 }
782 
783 static int
784 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
785 {
786 
787 	ASSERT_RACCT_ENABLED();
788 
789 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
790 		if (rule->rr_subject_type != filter->rr_subject_type)
791 			return (0);
792 
793 		switch (filter->rr_subject_type) {
794 		case RCTL_SUBJECT_TYPE_PROCESS:
795 			if (filter->rr_subject.rs_proc != NULL &&
796 			    rule->rr_subject.rs_proc !=
797 			    filter->rr_subject.rs_proc)
798 				return (0);
799 			break;
800 		case RCTL_SUBJECT_TYPE_USER:
801 			if (filter->rr_subject.rs_uip != NULL &&
802 			    rule->rr_subject.rs_uip !=
803 			    filter->rr_subject.rs_uip)
804 				return (0);
805 			break;
806 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
807 			if (filter->rr_subject.rs_loginclass != NULL &&
808 			    rule->rr_subject.rs_loginclass !=
809 			    filter->rr_subject.rs_loginclass)
810 				return (0);
811 			break;
812 		case RCTL_SUBJECT_TYPE_JAIL:
813 			if (filter->rr_subject.rs_prison_racct != NULL &&
814 			    rule->rr_subject.rs_prison_racct !=
815 			    filter->rr_subject.rs_prison_racct)
816 				return (0);
817 			break;
818 		default:
819 			panic("rctl_rule_matches: unknown subject type %d",
820 			    filter->rr_subject_type);
821 		}
822 	}
823 
824 	if (filter->rr_resource != RACCT_UNDEFINED) {
825 		if (rule->rr_resource != filter->rr_resource)
826 			return (0);
827 	}
828 
829 	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
830 		if (rule->rr_action != filter->rr_action)
831 			return (0);
832 	}
833 
834 	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
835 		if (rule->rr_amount != filter->rr_amount)
836 			return (0);
837 	}
838 
839 	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
840 		if (rule->rr_per != filter->rr_per)
841 			return (0);
842 	}
843 
844 	return (1);
845 }
846 
847 static int
848 str2value(const char *str, int *value, struct dict *table)
849 {
850 	int i;
851 
852 	if (value == NULL)
853 		return (EINVAL);
854 
855 	for (i = 0; table[i].d_name != NULL; i++) {
856 		if (strcasecmp(table[i].d_name, str) == 0) {
857 			*value =  table[i].d_value;
858 			return (0);
859 		}
860 	}
861 
862 	return (EINVAL);
863 }
864 
865 static int
866 str2id(const char *str, id_t *value)
867 {
868 	char *end;
869 
870 	if (str == NULL)
871 		return (EINVAL);
872 
873 	*value = strtoul(str, &end, 10);
874 	if ((size_t)(end - str) != strlen(str))
875 		return (EINVAL);
876 
877 	return (0);
878 }
879 
/*
 * Parse the decimal string 'str' into '*value'.
 *
 * Returns 0 on success.  Returns EINVAL if 'str' is NULL, is empty, or
 * has anything left over after the number, and ERANGE if the parsed
 * value is negative once converted to int64_t.  '*value' is written
 * only on success.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *end;
	int64_t parsed;

	/* An empty string is not a number; do not let it pass as 0. */
	if (str == NULL || str[0] == '\0')
		return (EINVAL);

	parsed = strtoul(str, &end, 10);
	/* The whole string must have been consumed. */
	if (*end != '\0')
		return (EINVAL);

	if (parsed < 0)
		return (ERANGE);

	*value = parsed;

	return (0);
}
897 
898 /*
899  * Connect the rule to the racct, increasing refcount for the rule.
900  */
901 static void
902 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
903 {
904 	struct rctl_rule_link *link;
905 
906 	ASSERT_RACCT_ENABLED();
907 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
908 
909 	rctl_rule_acquire(rule);
910 	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
911 	link->rrl_rule = rule;
912 	link->rrl_exceeded = 0;
913 
914 	RCTL_WLOCK();
915 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
916 	RCTL_WUNLOCK();
917 }
918 
919 static int
920 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
921 {
922 	struct rctl_rule_link *link;
923 
924 	ASSERT_RACCT_ENABLED();
925 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
926 	RCTL_WLOCK_ASSERT();
927 
928 	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
929 	if (link == NULL)
930 		return (ENOMEM);
931 	rctl_rule_acquire(rule);
932 	link->rrl_rule = rule;
933 	link->rrl_exceeded = 0;
934 
935 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
936 	return (0);
937 }
938 
939 /*
940  * Remove limits for a rules matching the filter and release
941  * the refcounts for the rules, possibly freeing them.  Returns
942  * the number of limit structures removed.
943  */
944 static int
945 rctl_racct_remove_rules(struct racct *racct,
946     const struct rctl_rule *filter)
947 {
948 	int removed = 0;
949 	struct rctl_rule_link *link, *linktmp;
950 
951 	ASSERT_RACCT_ENABLED();
952 	RCTL_WLOCK_ASSERT();
953 
954 	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
955 		if (!rctl_rule_matches(link->rrl_rule, filter))
956 			continue;
957 
958 		LIST_REMOVE(link, rrl_next);
959 		rctl_rule_release(link->rrl_rule);
960 		uma_zfree(rctl_rule_link_zone, link);
961 		removed++;
962 	}
963 	return (removed);
964 }
965 
966 static void
967 rctl_rule_acquire_subject(struct rctl_rule *rule)
968 {
969 
970 	ASSERT_RACCT_ENABLED();
971 
972 	switch (rule->rr_subject_type) {
973 	case RCTL_SUBJECT_TYPE_UNDEFINED:
974 	case RCTL_SUBJECT_TYPE_PROCESS:
975 		break;
976 	case RCTL_SUBJECT_TYPE_JAIL:
977 		if (rule->rr_subject.rs_prison_racct != NULL)
978 			prison_racct_hold(rule->rr_subject.rs_prison_racct);
979 		break;
980 	case RCTL_SUBJECT_TYPE_USER:
981 		if (rule->rr_subject.rs_uip != NULL)
982 			uihold(rule->rr_subject.rs_uip);
983 		break;
984 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
985 		if (rule->rr_subject.rs_loginclass != NULL)
986 			loginclass_hold(rule->rr_subject.rs_loginclass);
987 		break;
988 	default:
989 		panic("rctl_rule_acquire_subject: unknown subject type %d",
990 		    rule->rr_subject_type);
991 	}
992 }
993 
994 static void
995 rctl_rule_release_subject(struct rctl_rule *rule)
996 {
997 
998 	ASSERT_RACCT_ENABLED();
999 
1000 	switch (rule->rr_subject_type) {
1001 	case RCTL_SUBJECT_TYPE_UNDEFINED:
1002 	case RCTL_SUBJECT_TYPE_PROCESS:
1003 		break;
1004 	case RCTL_SUBJECT_TYPE_JAIL:
1005 		if (rule->rr_subject.rs_prison_racct != NULL)
1006 			prison_racct_free(rule->rr_subject.rs_prison_racct);
1007 		break;
1008 	case RCTL_SUBJECT_TYPE_USER:
1009 		if (rule->rr_subject.rs_uip != NULL)
1010 			uifree(rule->rr_subject.rs_uip);
1011 		break;
1012 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1013 		if (rule->rr_subject.rs_loginclass != NULL)
1014 			loginclass_free(rule->rr_subject.rs_loginclass);
1015 		break;
1016 	default:
1017 		panic("rctl_rule_release_subject: unknown subject type %d",
1018 		    rule->rr_subject_type);
1019 	}
1020 }
1021 
1022 struct rctl_rule *
1023 rctl_rule_alloc(int flags)
1024 {
1025 	struct rctl_rule *rule;
1026 
1027 	ASSERT_RACCT_ENABLED();
1028 
1029 	rule = uma_zalloc(rctl_rule_zone, flags);
1030 	if (rule == NULL)
1031 		return (NULL);
1032 	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1033 	rule->rr_subject.rs_proc = NULL;
1034 	rule->rr_subject.rs_uip = NULL;
1035 	rule->rr_subject.rs_loginclass = NULL;
1036 	rule->rr_subject.rs_prison_racct = NULL;
1037 	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1038 	rule->rr_resource = RACCT_UNDEFINED;
1039 	rule->rr_action = RCTL_ACTION_UNDEFINED;
1040 	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1041 	refcount_init(&rule->rr_refcount, 1);
1042 
1043 	return (rule);
1044 }
1045 
1046 struct rctl_rule *
1047 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1048 {
1049 	struct rctl_rule *copy;
1050 
1051 	ASSERT_RACCT_ENABLED();
1052 
1053 	copy = uma_zalloc(rctl_rule_zone, flags);
1054 	if (copy == NULL)
1055 		return (NULL);
1056 	copy->rr_subject_type = rule->rr_subject_type;
1057 	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1058 	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1059 	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1060 	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1061 	copy->rr_per = rule->rr_per;
1062 	copy->rr_resource = rule->rr_resource;
1063 	copy->rr_action = rule->rr_action;
1064 	copy->rr_amount = rule->rr_amount;
1065 	refcount_init(&copy->rr_refcount, 1);
1066 	rctl_rule_acquire_subject(copy);
1067 
1068 	return (copy);
1069 }
1070 
1071 void
1072 rctl_rule_acquire(struct rctl_rule *rule)
1073 {
1074 
1075 	ASSERT_RACCT_ENABLED();
1076 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1077 
1078 	refcount_acquire(&rule->rr_refcount);
1079 }
1080 
1081 static void
1082 rctl_rule_free(void *context, int pending)
1083 {
1084 	struct rctl_rule *rule;
1085 
1086 	rule = (struct rctl_rule *)context;
1087 
1088 	ASSERT_RACCT_ENABLED();
1089 	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1090 
1091 	/*
1092 	 * We don't need locking here; rule is guaranteed to be inaccessible.
1093 	 */
1094 
1095 	rctl_rule_release_subject(rule);
1096 	uma_zfree(rctl_rule_zone, rule);
1097 }
1098 
1099 void
1100 rctl_rule_release(struct rctl_rule *rule)
1101 {
1102 
1103 	ASSERT_RACCT_ENABLED();
1104 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1105 
1106 	if (refcount_release(&rule->rr_refcount)) {
1107 		/*
1108 		 * rctl_rule_release() is often called when iterating
1109 		 * over all the uidinfo structures in the system,
1110 		 * holding uihashtbl_lock.  Since rctl_rule_free()
1111 		 * might end up calling uifree(), this would lead
1112 		 * to lock recursion.  Use taskqueue to avoid this.
1113 		 */
1114 		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1115 		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1116 	}
1117 }
1118 
1119 static int
1120 rctl_rule_fully_specified(const struct rctl_rule *rule)
1121 {
1122 
1123 	ASSERT_RACCT_ENABLED();
1124 
1125 	switch (rule->rr_subject_type) {
1126 	case RCTL_SUBJECT_TYPE_UNDEFINED:
1127 		return (0);
1128 	case RCTL_SUBJECT_TYPE_PROCESS:
1129 		if (rule->rr_subject.rs_proc == NULL)
1130 			return (0);
1131 		break;
1132 	case RCTL_SUBJECT_TYPE_USER:
1133 		if (rule->rr_subject.rs_uip == NULL)
1134 			return (0);
1135 		break;
1136 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1137 		if (rule->rr_subject.rs_loginclass == NULL)
1138 			return (0);
1139 		break;
1140 	case RCTL_SUBJECT_TYPE_JAIL:
1141 		if (rule->rr_subject.rs_prison_racct == NULL)
1142 			return (0);
1143 		break;
1144 	default:
1145 		panic("rctl_rule_fully_specified: unknown subject type %d",
1146 		    rule->rr_subject_type);
1147 	}
1148 	if (rule->rr_resource == RACCT_UNDEFINED)
1149 		return (0);
1150 	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1151 		return (0);
1152 	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1153 		return (0);
1154 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1155 		return (0);
1156 
1157 	return (1);
1158 }
1159 
1160 static int
1161 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1162 {
1163 	int error = 0;
1164 	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1165 	     *amountstr, *perstr;
1166 	struct rctl_rule *rule;
1167 	id_t id;
1168 
1169 	ASSERT_RACCT_ENABLED();
1170 
1171 	rule = rctl_rule_alloc(M_WAITOK);
1172 
1173 	subjectstr = strsep(&rulestr, ":");
1174 	subject_idstr = strsep(&rulestr, ":");
1175 	resourcestr = strsep(&rulestr, ":");
1176 	actionstr = strsep(&rulestr, "=/");
1177 	amountstr = strsep(&rulestr, "/");
1178 	perstr = rulestr;
1179 
1180 	if (subjectstr == NULL || subjectstr[0] == '\0')
1181 		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1182 	else {
1183 		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1184 		if (error != 0)
1185 			goto out;
1186 	}
1187 
1188 	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1189 		rule->rr_subject.rs_proc = NULL;
1190 		rule->rr_subject.rs_uip = NULL;
1191 		rule->rr_subject.rs_loginclass = NULL;
1192 		rule->rr_subject.rs_prison_racct = NULL;
1193 	} else {
1194 		switch (rule->rr_subject_type) {
1195 		case RCTL_SUBJECT_TYPE_UNDEFINED:
1196 			error = EINVAL;
1197 			goto out;
1198 		case RCTL_SUBJECT_TYPE_PROCESS:
1199 			error = str2id(subject_idstr, &id);
1200 			if (error != 0)
1201 				goto out;
1202 			sx_assert(&allproc_lock, SA_LOCKED);
1203 			rule->rr_subject.rs_proc = pfind(id);
1204 			if (rule->rr_subject.rs_proc == NULL) {
1205 				error = ESRCH;
1206 				goto out;
1207 			}
1208 			PROC_UNLOCK(rule->rr_subject.rs_proc);
1209 			break;
1210 		case RCTL_SUBJECT_TYPE_USER:
1211 			error = str2id(subject_idstr, &id);
1212 			if (error != 0)
1213 				goto out;
1214 			rule->rr_subject.rs_uip = uifind(id);
1215 			break;
1216 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1217 			rule->rr_subject.rs_loginclass =
1218 			    loginclass_find(subject_idstr);
1219 			if (rule->rr_subject.rs_loginclass == NULL) {
1220 				error = ENAMETOOLONG;
1221 				goto out;
1222 			}
1223 			break;
1224 		case RCTL_SUBJECT_TYPE_JAIL:
1225 			rule->rr_subject.rs_prison_racct =
1226 			    prison_racct_find(subject_idstr);
1227 			if (rule->rr_subject.rs_prison_racct == NULL) {
1228 				error = ENAMETOOLONG;
1229 				goto out;
1230 			}
1231 			break;
1232                default:
1233                        panic("rctl_string_to_rule: unknown subject type %d",
1234                            rule->rr_subject_type);
1235                }
1236 	}
1237 
1238 	if (resourcestr == NULL || resourcestr[0] == '\0')
1239 		rule->rr_resource = RACCT_UNDEFINED;
1240 	else {
1241 		error = str2value(resourcestr, &rule->rr_resource,
1242 		    resourcenames);
1243 		if (error != 0)
1244 			goto out;
1245 	}
1246 
1247 	if (actionstr == NULL || actionstr[0] == '\0')
1248 		rule->rr_action = RCTL_ACTION_UNDEFINED;
1249 	else {
1250 		error = str2value(actionstr, &rule->rr_action, actionnames);
1251 		if (error != 0)
1252 			goto out;
1253 	}
1254 
1255 	if (amountstr == NULL || amountstr[0] == '\0')
1256 		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1257 	else {
1258 		error = str2int64(amountstr, &rule->rr_amount);
1259 		if (error != 0)
1260 			goto out;
1261 		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1262 			if (rule->rr_amount > INT64_MAX / 1000000) {
1263 				error = ERANGE;
1264 				goto out;
1265 			}
1266 			rule->rr_amount *= 1000000;
1267 		}
1268 	}
1269 
1270 	if (perstr == NULL || perstr[0] == '\0')
1271 		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1272 	else {
1273 		error = str2value(perstr, &rule->rr_per, subjectnames);
1274 		if (error != 0)
1275 			goto out;
1276 	}
1277 
1278 out:
1279 	if (error == 0)
1280 		*rulep = rule;
1281 	else
1282 		rctl_rule_release(rule);
1283 
1284 	return (error);
1285 }
1286 
1287 /*
1288  * Link a rule with all the subjects it applies to.
1289  */
1290 int
1291 rctl_rule_add(struct rctl_rule *rule)
1292 {
1293 	struct proc *p;
1294 	struct ucred *cred;
1295 	struct uidinfo *uip;
1296 	struct prison *pr;
1297 	struct prison_racct *prr;
1298 	struct loginclass *lc;
1299 	struct rctl_rule *rule2;
1300 	int match;
1301 
1302 	ASSERT_RACCT_ENABLED();
1303 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1304 
1305 	/*
1306 	 * Some rules just don't make sense, like "deny" rule for an undeniable
1307 	 * resource.  The exception are the RSS and %CPU resources - they are
1308 	 * not deniable in the racct sense, but the limit is enforced in
1309 	 * a different way.
1310 	 */
1311 	if (rule->rr_action == RCTL_ACTION_DENY &&
1312 	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
1313 	    rule->rr_resource != RACCT_RSS &&
1314 	    rule->rr_resource != RACCT_PCTCPU) {
1315 		return (EOPNOTSUPP);
1316 	}
1317 
1318 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1319 	    !RACCT_IS_DECAYING(rule->rr_resource)) {
1320 		return (EOPNOTSUPP);
1321 	}
1322 
1323 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1324 	    rule->rr_resource == RACCT_PCTCPU) {
1325 		return (EOPNOTSUPP);
1326 	}
1327 
1328 	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1329 	    RACCT_IS_SLOPPY(rule->rr_resource)) {
1330 		return (EOPNOTSUPP);
1331 	}
1332 
1333 	/*
1334 	 * Make sure there are no duplicated rules.  Also, for the "deny"
1335 	 * rules, remove ones differing only by "amount".
1336 	 */
1337 	if (rule->rr_action == RCTL_ACTION_DENY) {
1338 		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1339 		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1340 		rctl_rule_remove(rule2);
1341 		rctl_rule_release(rule2);
1342 	} else
1343 		rctl_rule_remove(rule);
1344 
1345 	switch (rule->rr_subject_type) {
1346 	case RCTL_SUBJECT_TYPE_PROCESS:
1347 		p = rule->rr_subject.rs_proc;
1348 		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1349 
1350 		rctl_racct_add_rule(p->p_racct, rule);
1351 		/*
1352 		 * In case of per-process rule, we don't have anything more
1353 		 * to do.
1354 		 */
1355 		return (0);
1356 
1357 	case RCTL_SUBJECT_TYPE_USER:
1358 		uip = rule->rr_subject.rs_uip;
1359 		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1360 		rctl_racct_add_rule(uip->ui_racct, rule);
1361 		break;
1362 
1363 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1364 		lc = rule->rr_subject.rs_loginclass;
1365 		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1366 		rctl_racct_add_rule(lc->lc_racct, rule);
1367 		break;
1368 
1369 	case RCTL_SUBJECT_TYPE_JAIL:
1370 		prr = rule->rr_subject.rs_prison_racct;
1371 		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1372 		rctl_racct_add_rule(prr->prr_racct, rule);
1373 		break;
1374 
1375 	default:
1376 		panic("rctl_rule_add: unknown subject type %d",
1377 		    rule->rr_subject_type);
1378 	}
1379 
1380 	/*
1381 	 * Now go through all the processes and add the new rule to the ones
1382 	 * it applies to.
1383 	 */
1384 	sx_assert(&allproc_lock, SA_LOCKED);
1385 	FOREACH_PROC_IN_SYSTEM(p) {
1386 		cred = p->p_ucred;
1387 		switch (rule->rr_subject_type) {
1388 		case RCTL_SUBJECT_TYPE_USER:
1389 			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1390 			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1391 				break;
1392 			continue;
1393 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1394 			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1395 				break;
1396 			continue;
1397 		case RCTL_SUBJECT_TYPE_JAIL:
1398 			match = 0;
1399 			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1400 				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1401 					match = 1;
1402 					break;
1403 				}
1404 			}
1405 			if (match)
1406 				break;
1407 			continue;
1408 		default:
1409 			panic("rctl_rule_add: unknown subject type %d",
1410 			    rule->rr_subject_type);
1411 		}
1412 
1413 		rctl_racct_add_rule(p->p_racct, rule);
1414 	}
1415 
1416 	return (0);
1417 }
1418 
/*
 * "Pre" hook handed to the *_racct_foreach() iterators in this file:
 * takes the rctl write lock.
 */
static void
rctl_rule_pre_callback(void)
{

	RCTL_WLOCK();
}
1425 
/*
 * "Post" hook handed to the *_racct_foreach() iterators in this file:
 * drops the rctl write lock.
 */
static void
rctl_rule_post_callback(void)
{

	RCTL_WUNLOCK();
}
1432 
/*
 * Iterator callback used by rctl_rule_remove(): removes from 'racct'
 * the rules matching the filter passed as 'arg2' and adds the value
 * returned by rctl_racct_remove_rules() to the int counter that 'arg3'
 * points to.  Must be called with the rctl write lock held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *totalp = arg3;

	ASSERT_RACCT_ENABLED();
	RCTL_WLOCK_ASSERT();

	*totalp += rctl_racct_remove_rules(racct, filter);
}
1446 
1447 /*
1448  * Remove all rules that match the filter.
1449  */
1450 int
1451 rctl_rule_remove(struct rctl_rule *filter)
1452 {
1453 	int found = 0;
1454 	struct proc *p;
1455 
1456 	ASSERT_RACCT_ENABLED();
1457 
1458 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1459 	    filter->rr_subject.rs_proc != NULL) {
1460 		p = filter->rr_subject.rs_proc;
1461 		RCTL_WLOCK();
1462 		found = rctl_racct_remove_rules(p->p_racct, filter);
1463 		RCTL_WUNLOCK();
1464 		if (found)
1465 			return (0);
1466 		return (ESRCH);
1467 	}
1468 
1469 	loginclass_racct_foreach(rctl_rule_remove_callback,
1470 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1471 	    filter, (void *)&found);
1472 	ui_racct_foreach(rctl_rule_remove_callback,
1473 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1474 	    filter, (void *)&found);
1475 	prison_racct_foreach(rctl_rule_remove_callback,
1476 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1477 	    filter, (void *)&found);
1478 
1479 	sx_assert(&allproc_lock, SA_LOCKED);
1480 	RCTL_WLOCK();
1481 	FOREACH_PROC_IN_SYSTEM(p) {
1482 		found += rctl_racct_remove_rules(p->p_racct, filter);
1483 	}
1484 	RCTL_WUNLOCK();
1485 
1486 	if (found)
1487 		return (0);
1488 	return (ESRCH);
1489 }
1490 
1491 /*
1492  * Appends a rule to the sbuf.
1493  */
1494 static void
1495 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1496 {
1497 	int64_t amount;
1498 
1499 	ASSERT_RACCT_ENABLED();
1500 
1501 	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1502 
1503 	switch (rule->rr_subject_type) {
1504 	case RCTL_SUBJECT_TYPE_PROCESS:
1505 		if (rule->rr_subject.rs_proc == NULL)
1506 			sbuf_printf(sb, ":");
1507 		else
1508 			sbuf_printf(sb, "%d:",
1509 			    rule->rr_subject.rs_proc->p_pid);
1510 		break;
1511 	case RCTL_SUBJECT_TYPE_USER:
1512 		if (rule->rr_subject.rs_uip == NULL)
1513 			sbuf_printf(sb, ":");
1514 		else
1515 			sbuf_printf(sb, "%d:",
1516 			    rule->rr_subject.rs_uip->ui_uid);
1517 		break;
1518 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1519 		if (rule->rr_subject.rs_loginclass == NULL)
1520 			sbuf_printf(sb, ":");
1521 		else
1522 			sbuf_printf(sb, "%s:",
1523 			    rule->rr_subject.rs_loginclass->lc_name);
1524 		break;
1525 	case RCTL_SUBJECT_TYPE_JAIL:
1526 		if (rule->rr_subject.rs_prison_racct == NULL)
1527 			sbuf_printf(sb, ":");
1528 		else
1529 			sbuf_printf(sb, "%s:",
1530 			    rule->rr_subject.rs_prison_racct->prr_name);
1531 		break;
1532 	default:
1533 		panic("rctl_rule_to_sbuf: unknown subject type %d",
1534 		    rule->rr_subject_type);
1535 	}
1536 
1537 	amount = rule->rr_amount;
1538 	if (amount != RCTL_AMOUNT_UNDEFINED &&
1539 	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1540 		amount /= 1000000;
1541 
1542 	sbuf_printf(sb, "%s:%s=%jd",
1543 	    rctl_resource_name(rule->rr_resource),
1544 	    rctl_action_name(rule->rr_action),
1545 	    amount);
1546 
1547 	if (rule->rr_per != rule->rr_subject_type)
1548 		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1549 }
1550 
1551 /*
1552  * Routine used by RCTL syscalls to read in input string.
1553  */
1554 static int
1555 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1556 {
1557 	int error;
1558 	char *str;
1559 
1560 	ASSERT_RACCT_ENABLED();
1561 
1562 	if (inbuflen <= 0)
1563 		return (EINVAL);
1564 	if (inbuflen > RCTL_MAX_INBUFSIZE)
1565 		return (E2BIG);
1566 
1567 	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1568 	error = copyinstr(inbufp, str, inbuflen, NULL);
1569 	if (error != 0) {
1570 		free(str, M_RCTL);
1571 		return (error);
1572 	}
1573 
1574 	*inputstr = str;
1575 
1576 	return (0);
1577 }
1578 
1579 /*
1580  * Routine used by RCTL syscalls to write out output string.
1581  */
1582 static int
1583 rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1584 {
1585 	int error;
1586 
1587 	ASSERT_RACCT_ENABLED();
1588 
1589 	if (outputsbuf == NULL)
1590 		return (0);
1591 
1592 	sbuf_finish(outputsbuf);
1593 	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1594 		sbuf_delete(outputsbuf);
1595 		return (ERANGE);
1596 	}
1597 	error = copyout(sbuf_data(outputsbuf), outbufp,
1598 	    sbuf_len(outputsbuf) + 1);
1599 	sbuf_delete(outputsbuf);
1600 	return (error);
1601 }
1602 
1603 static struct sbuf *
1604 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1605 {
1606 	int i;
1607 	int64_t amount;
1608 	struct sbuf *sb;
1609 
1610 	ASSERT_RACCT_ENABLED();
1611 
1612 	sb = sbuf_new_auto();
1613 	for (i = 0; i <= RACCT_MAX; i++) {
1614 		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1615 			continue;
1616 		amount = racct->r_resources[i];
1617 		if (RACCT_IS_IN_MILLIONS(i))
1618 			amount /= 1000000;
1619 		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1620 	}
1621 	sbuf_setpos(sb, sbuf_len(sb) - 1);
1622 	return (sb);
1623 }
1624 
1625 int
1626 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1627 {
1628 	int error;
1629 	char *inputstr;
1630 	struct rctl_rule *filter;
1631 	struct sbuf *outputsbuf = NULL;
1632 	struct proc *p;
1633 	struct uidinfo *uip;
1634 	struct loginclass *lc;
1635 	struct prison_racct *prr;
1636 
1637 	if (!racct_enable)
1638 		return (ENOSYS);
1639 
1640 	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1641 	if (error != 0)
1642 		return (error);
1643 
1644 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1645 	if (error != 0)
1646 		return (error);
1647 
1648 	sx_slock(&allproc_lock);
1649 	error = rctl_string_to_rule(inputstr, &filter);
1650 	free(inputstr, M_RCTL);
1651 	if (error != 0) {
1652 		sx_sunlock(&allproc_lock);
1653 		return (error);
1654 	}
1655 
1656 	switch (filter->rr_subject_type) {
1657 	case RCTL_SUBJECT_TYPE_PROCESS:
1658 		p = filter->rr_subject.rs_proc;
1659 		if (p == NULL) {
1660 			error = EINVAL;
1661 			goto out;
1662 		}
1663 		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1664 		break;
1665 	case RCTL_SUBJECT_TYPE_USER:
1666 		uip = filter->rr_subject.rs_uip;
1667 		if (uip == NULL) {
1668 			error = EINVAL;
1669 			goto out;
1670 		}
1671 		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1672 		break;
1673 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1674 		lc = filter->rr_subject.rs_loginclass;
1675 		if (lc == NULL) {
1676 			error = EINVAL;
1677 			goto out;
1678 		}
1679 		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1680 		break;
1681 	case RCTL_SUBJECT_TYPE_JAIL:
1682 		prr = filter->rr_subject.rs_prison_racct;
1683 		if (prr == NULL) {
1684 			error = EINVAL;
1685 			goto out;
1686 		}
1687 		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1688 		break;
1689 	default:
1690 		error = EINVAL;
1691 	}
1692 out:
1693 	rctl_rule_release(filter);
1694 	sx_sunlock(&allproc_lock);
1695 	if (error != 0)
1696 		return (error);
1697 
1698 	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1699 
1700 	return (error);
1701 }
1702 
1703 static void
1704 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1705 {
1706 	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1707 	struct rctl_rule_link *link;
1708 	struct sbuf *sb = (struct sbuf *)arg3;
1709 
1710 	ASSERT_RACCT_ENABLED();
1711 	RCTL_LOCK_ASSERT();
1712 
1713 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1714 		if (!rctl_rule_matches(link->rrl_rule, filter))
1715 			continue;
1716 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1717 		sbuf_printf(sb, ",");
1718 	}
1719 }
1720 
1721 int
1722 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1723 {
1724 	int error;
1725 	size_t bufsize;
1726 	char *inputstr, *buf;
1727 	struct sbuf *sb;
1728 	struct rctl_rule *filter;
1729 	struct rctl_rule_link *link;
1730 	struct proc *p;
1731 
1732 	if (!racct_enable)
1733 		return (ENOSYS);
1734 
1735 	error = priv_check(td, PRIV_RCTL_GET_RULES);
1736 	if (error != 0)
1737 		return (error);
1738 
1739 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1740 	if (error != 0)
1741 		return (error);
1742 
1743 	sx_slock(&allproc_lock);
1744 	error = rctl_string_to_rule(inputstr, &filter);
1745 	free(inputstr, M_RCTL);
1746 	if (error != 0) {
1747 		sx_sunlock(&allproc_lock);
1748 		return (error);
1749 	}
1750 
1751 	bufsize = uap->outbuflen;
1752 	if (bufsize > rctl_maxbufsize) {
1753 		sx_sunlock(&allproc_lock);
1754 		return (E2BIG);
1755 	}
1756 
1757 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1758 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1759 	KASSERT(sb != NULL, ("sbuf_new failed"));
1760 
1761 	FOREACH_PROC_IN_SYSTEM(p) {
1762 		RCTL_RLOCK();
1763 		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1764 			/*
1765 			 * Non-process rules will be added to the buffer later.
1766 			 * Adding them here would result in duplicated output.
1767 			 */
1768 			if (link->rrl_rule->rr_subject_type !=
1769 			    RCTL_SUBJECT_TYPE_PROCESS)
1770 				continue;
1771 			if (!rctl_rule_matches(link->rrl_rule, filter))
1772 				continue;
1773 			rctl_rule_to_sbuf(sb, link->rrl_rule);
1774 			sbuf_printf(sb, ",");
1775 		}
1776 		RCTL_RUNLOCK();
1777 	}
1778 
1779 	loginclass_racct_foreach(rctl_get_rules_callback,
1780 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1781 	    filter, sb);
1782 	ui_racct_foreach(rctl_get_rules_callback,
1783 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1784 	    filter, sb);
1785 	prison_racct_foreach(rctl_get_rules_callback,
1786 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1787 	    filter, sb);
1788 	if (sbuf_error(sb) == ENOMEM) {
1789 		error = ERANGE;
1790 		goto out;
1791 	}
1792 
1793 	/*
1794 	 * Remove trailing ",".
1795 	 */
1796 	if (sbuf_len(sb) > 0)
1797 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1798 
1799 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1800 out:
1801 	rctl_rule_release(filter);
1802 	sx_sunlock(&allproc_lock);
1803 	free(buf, M_RCTL);
1804 	return (error);
1805 }
1806 
1807 int
1808 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1809 {
1810 	int error;
1811 	size_t bufsize;
1812 	char *inputstr, *buf;
1813 	struct sbuf *sb;
1814 	struct rctl_rule *filter;
1815 	struct rctl_rule_link *link;
1816 
1817 	if (!racct_enable)
1818 		return (ENOSYS);
1819 
1820 	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1821 	if (error != 0)
1822 		return (error);
1823 
1824 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1825 	if (error != 0)
1826 		return (error);
1827 
1828 	sx_slock(&allproc_lock);
1829 	error = rctl_string_to_rule(inputstr, &filter);
1830 	free(inputstr, M_RCTL);
1831 	if (error != 0) {
1832 		sx_sunlock(&allproc_lock);
1833 		return (error);
1834 	}
1835 
1836 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1837 		rctl_rule_release(filter);
1838 		sx_sunlock(&allproc_lock);
1839 		return (EINVAL);
1840 	}
1841 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1842 		rctl_rule_release(filter);
1843 		sx_sunlock(&allproc_lock);
1844 		return (EOPNOTSUPP);
1845 	}
1846 	if (filter->rr_subject.rs_proc == NULL) {
1847 		rctl_rule_release(filter);
1848 		sx_sunlock(&allproc_lock);
1849 		return (EINVAL);
1850 	}
1851 
1852 	bufsize = uap->outbuflen;
1853 	if (bufsize > rctl_maxbufsize) {
1854 		rctl_rule_release(filter);
1855 		sx_sunlock(&allproc_lock);
1856 		return (E2BIG);
1857 	}
1858 
1859 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1860 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1861 	KASSERT(sb != NULL, ("sbuf_new failed"));
1862 
1863 	RCTL_RLOCK();
1864 	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1865 	    rrl_next) {
1866 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1867 		sbuf_printf(sb, ",");
1868 	}
1869 	RCTL_RUNLOCK();
1870 	if (sbuf_error(sb) == ENOMEM) {
1871 		error = ERANGE;
1872 		goto out;
1873 	}
1874 
1875 	/*
1876 	 * Remove trailing ",".
1877 	 */
1878 	if (sbuf_len(sb) > 0)
1879 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1880 
1881 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1882 out:
1883 	rctl_rule_release(filter);
1884 	sx_sunlock(&allproc_lock);
1885 	free(buf, M_RCTL);
1886 	return (error);
1887 }
1888 
1889 int
1890 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1891 {
1892 	int error;
1893 	struct rctl_rule *rule;
1894 	char *inputstr;
1895 
1896 	if (!racct_enable)
1897 		return (ENOSYS);
1898 
1899 	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1900 	if (error != 0)
1901 		return (error);
1902 
1903 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1904 	if (error != 0)
1905 		return (error);
1906 
1907 	sx_slock(&allproc_lock);
1908 	error = rctl_string_to_rule(inputstr, &rule);
1909 	free(inputstr, M_RCTL);
1910 	if (error != 0) {
1911 		sx_sunlock(&allproc_lock);
1912 		return (error);
1913 	}
1914 	/*
1915 	 * The 'per' part of a rule is optional.
1916 	 */
1917 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1918 	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1919 		rule->rr_per = rule->rr_subject_type;
1920 
1921 	if (!rctl_rule_fully_specified(rule)) {
1922 		error = EINVAL;
1923 		goto out;
1924 	}
1925 
1926 	error = rctl_rule_add(rule);
1927 
1928 out:
1929 	rctl_rule_release(rule);
1930 	sx_sunlock(&allproc_lock);
1931 	return (error);
1932 }
1933 
1934 int
1935 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1936 {
1937 	int error;
1938 	struct rctl_rule *filter;
1939 	char *inputstr;
1940 
1941 	if (!racct_enable)
1942 		return (ENOSYS);
1943 
1944 	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1945 	if (error != 0)
1946 		return (error);
1947 
1948 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1949 	if (error != 0)
1950 		return (error);
1951 
1952 	sx_slock(&allproc_lock);
1953 	error = rctl_string_to_rule(inputstr, &filter);
1954 	free(inputstr, M_RCTL);
1955 	if (error != 0) {
1956 		sx_sunlock(&allproc_lock);
1957 		return (error);
1958 	}
1959 
1960 	error = rctl_rule_remove(filter);
1961 	rctl_rule_release(filter);
1962 	sx_sunlock(&allproc_lock);
1963 
1964 	return (error);
1965 }
1966 
1967 /*
1968  * Update RCTL rule list after credential change.
1969  */
1970 void
1971 rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1972 {
1973 	int rulecnt, i;
1974 	struct rctl_rule_link *link, *newlink;
1975 	struct uidinfo *newuip;
1976 	struct loginclass *newlc;
1977 	struct prison_racct *newprr;
1978 	LIST_HEAD(, rctl_rule_link) newrules;
1979 
1980 	ASSERT_RACCT_ENABLED();
1981 
1982 	newuip = newcred->cr_ruidinfo;
1983 	newlc = newcred->cr_loginclass;
1984 	newprr = newcred->cr_prison->pr_prison_racct;
1985 
1986 	LIST_INIT(&newrules);
1987 
1988 again:
1989 	/*
1990 	 * First, count the rules that apply to the process with new
1991 	 * credentials.
1992 	 */
1993 	rulecnt = 0;
1994 	RCTL_RLOCK();
1995 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1996 		if (link->rrl_rule->rr_subject_type ==
1997 		    RCTL_SUBJECT_TYPE_PROCESS)
1998 			rulecnt++;
1999 	}
2000 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
2001 		rulecnt++;
2002 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
2003 		rulecnt++;
2004 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
2005 		rulecnt++;
2006 	RCTL_RUNLOCK();
2007 
2008 	/*
2009 	 * Create temporary list.  We've dropped the rctl_lock in order
2010 	 * to use M_WAITOK.
2011 	 */
2012 	for (i = 0; i < rulecnt; i++) {
2013 		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
2014 		newlink->rrl_rule = NULL;
2015 		newlink->rrl_exceeded = 0;
2016 		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
2017 	}
2018 
2019 	newlink = LIST_FIRST(&newrules);
2020 
2021 	/*
2022 	 * Assign rules to the newly allocated list entries.
2023 	 */
2024 	RCTL_WLOCK();
2025 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2026 		if (link->rrl_rule->rr_subject_type ==
2027 		    RCTL_SUBJECT_TYPE_PROCESS) {
2028 			if (newlink == NULL)
2029 				goto goaround;
2030 			rctl_rule_acquire(link->rrl_rule);
2031 			newlink->rrl_rule = link->rrl_rule;
2032 			newlink->rrl_exceeded = link->rrl_exceeded;
2033 			newlink = LIST_NEXT(newlink, rrl_next);
2034 			rulecnt--;
2035 		}
2036 	}
2037 
2038 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2039 		if (newlink == NULL)
2040 			goto goaround;
2041 		rctl_rule_acquire(link->rrl_rule);
2042 		newlink->rrl_rule = link->rrl_rule;
2043 		newlink->rrl_exceeded = link->rrl_exceeded;
2044 		newlink = LIST_NEXT(newlink, rrl_next);
2045 		rulecnt--;
2046 	}
2047 
2048 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2049 		if (newlink == NULL)
2050 			goto goaround;
2051 		rctl_rule_acquire(link->rrl_rule);
2052 		newlink->rrl_rule = link->rrl_rule;
2053 		newlink->rrl_exceeded = link->rrl_exceeded;
2054 		newlink = LIST_NEXT(newlink, rrl_next);
2055 		rulecnt--;
2056 	}
2057 
2058 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2059 		if (newlink == NULL)
2060 			goto goaround;
2061 		rctl_rule_acquire(link->rrl_rule);
2062 		newlink->rrl_rule = link->rrl_rule;
2063 		newlink->rrl_exceeded = link->rrl_exceeded;
2064 		newlink = LIST_NEXT(newlink, rrl_next);
2065 		rulecnt--;
2066 	}
2067 
2068 	if (rulecnt == 0) {
2069 		/*
2070 		 * Free the old rule list.
2071 		 */
2072 		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2073 			link = LIST_FIRST(&p->p_racct->r_rule_links);
2074 			LIST_REMOVE(link, rrl_next);
2075 			rctl_rule_release(link->rrl_rule);
2076 			uma_zfree(rctl_rule_link_zone, link);
2077 		}
2078 
2079 		/*
2080 		 * Replace lists and we're done.
2081 		 *
2082 		 * XXX: Is there any way to switch list heads instead
2083 		 *      of iterating here?
2084 		 */
2085 		while (!LIST_EMPTY(&newrules)) {
2086 			newlink = LIST_FIRST(&newrules);
2087 			LIST_REMOVE(newlink, rrl_next);
2088 			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2089 			    newlink, rrl_next);
2090 		}
2091 
2092 		RCTL_WUNLOCK();
2093 
2094 		return;
2095 	}
2096 
2097 goaround:
2098 	RCTL_WUNLOCK();
2099 
2100 	/*
2101 	 * Rule list changed while we were not holding the rctl_lock.
2102 	 * Free the new list and try again.
2103 	 */
2104 	while (!LIST_EMPTY(&newrules)) {
2105 		newlink = LIST_FIRST(&newrules);
2106 		LIST_REMOVE(newlink, rrl_next);
2107 		if (newlink->rrl_rule != NULL)
2108 			rctl_rule_release(newlink->rrl_rule);
2109 		uma_zfree(rctl_rule_link_zone, newlink);
2110 	}
2111 
2112 	goto again;
2113 }
2114 
2115 /*
2116  * Assign RCTL rules to the newly created process.
2117  */
2118 int
2119 rctl_proc_fork(struct proc *parent, struct proc *child)
2120 {
2121 	int error;
2122 	struct rctl_rule_link *link;
2123 	struct rctl_rule *rule;
2124 
2125 	LIST_INIT(&child->p_racct->r_rule_links);
2126 
2127 	ASSERT_RACCT_ENABLED();
2128 	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2129 
2130 	RCTL_WLOCK();
2131 
2132 	/*
2133 	 * Go through limits applicable to the parent and assign them
2134 	 * to the child.  Rules with 'process' subject have to be duplicated
2135 	 * in order to make their rr_subject point to the new process.
2136 	 */
2137 	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2138 		if (link->rrl_rule->rr_subject_type ==
2139 		    RCTL_SUBJECT_TYPE_PROCESS) {
2140 			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2141 			if (rule == NULL)
2142 				goto fail;
2143 			KASSERT(rule->rr_subject.rs_proc == parent,
2144 			    ("rule->rr_subject.rs_proc != parent"));
2145 			rule->rr_subject.rs_proc = child;
2146 			error = rctl_racct_add_rule_locked(child->p_racct,
2147 			    rule);
2148 			rctl_rule_release(rule);
2149 			if (error != 0)
2150 				goto fail;
2151 		} else {
2152 			error = rctl_racct_add_rule_locked(child->p_racct,
2153 			    link->rrl_rule);
2154 			if (error != 0)
2155 				goto fail;
2156 		}
2157 	}
2158 
2159 	RCTL_WUNLOCK();
2160 	return (0);
2161 
2162 fail:
2163 	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2164 		link = LIST_FIRST(&child->p_racct->r_rule_links);
2165 		LIST_REMOVE(link, rrl_next);
2166 		rctl_rule_release(link->rrl_rule);
2167 		uma_zfree(rctl_rule_link_zone, link);
2168 	}
2169 	RCTL_WUNLOCK();
2170 	return (EAGAIN);
2171 }
2172 
2173 /*
2174  * Release rules attached to the racct.
2175  */
2176 void
2177 rctl_racct_release(struct racct *racct)
2178 {
2179 	struct rctl_rule_link *link;
2180 
2181 	ASSERT_RACCT_ENABLED();
2182 
2183 	RCTL_WLOCK();
2184 	while (!LIST_EMPTY(&racct->r_rule_links)) {
2185 		link = LIST_FIRST(&racct->r_rule_links);
2186 		LIST_REMOVE(link, rrl_next);
2187 		rctl_rule_release(link->rrl_rule);
2188 		uma_zfree(rctl_rule_link_zone, link);
2189 	}
2190 	RCTL_WUNLOCK();
2191 }
2192 
2193 static void
2194 rctl_init(void)
2195 {
2196 
2197 	if (!racct_enable)
2198 		return;
2199 
2200 	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2201 	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2202 	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
2203 	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2204 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
2205 
2206 	/*
2207 	 * Set default values, making sure not to overwrite the ones
2208 	 * fetched from tunables.  Most of those could be set at the
2209 	 * declaration, except for the rctl_throttle_max - we cannot
2210 	 * set it there due to hz not being compile time constant.
2211 	 */
2212 	if (rctl_throttle_min < 1)
2213 		rctl_throttle_min = 1;
2214 	if (rctl_throttle_max < rctl_throttle_min)
2215 		rctl_throttle_max = 2 * hz;
2216 	if (rctl_throttle_pct < 0)
2217 		rctl_throttle_pct = 100;
2218 	if (rctl_throttle_pct2 < 0)
2219 		rctl_throttle_pct2 = 100;
2220 }
2221 
2222 #else /* !RCTL */
2223 
/*
 * Stub for kernels built without RCTL: the system call is not
 * implemented, so ENOSYS is returned whatever the arguments are.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

	/* Neither argument is looked at. */
	(void)td;
	(void)uap;
	return (ENOSYS);
}
2230 
/*
 * Stub for kernels built without RCTL: the system call is not
 * implemented, so ENOSYS is returned whatever the arguments are.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

	/* Neither argument is looked at. */
	(void)td;
	(void)uap;
	return (ENOSYS);
}
2237 
/*
 * Stub for kernels built without RCTL: the system call is not
 * implemented, so ENOSYS is returned whatever the arguments are.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

	/* Neither argument is looked at. */
	(void)td;
	(void)uap;
	return (ENOSYS);
}
2244 
/*
 * Stub for kernels built without RCTL: the system call is not
 * implemented, so ENOSYS is returned whatever the arguments are.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

	/* Neither argument is looked at. */
	(void)td;
	(void)uap;
	return (ENOSYS);
}
2251 
/*
 * Stub for kernels built without RCTL: the system call is not
 * implemented, so ENOSYS is returned whatever the arguments are.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

	/* Neither argument is looked at. */
	(void)td;
	(void)uap;
	return (ENOSYS);
}
2258 
2259 #endif /* !RCTL */
2260