xref: /freebsd/sys/kern/kern_jail.c (revision f0a75d274af375d15b97b830966b99a02b7db911)
1 /*-
2  * ----------------------------------------------------------------------------
3  * "THE BEER-WARE LICENSE" (Revision 42):
4  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
5  * can do whatever you want with this stuff. If we meet some day, and you think
6  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
7  * ----------------------------------------------------------------------------
8  */
9 
10 #include <sys/cdefs.h>
11 __FBSDID("$FreeBSD$");
12 
13 #include "opt_mac.h"
14 
15 #include <sys/param.h>
16 #include <sys/types.h>
17 #include <sys/kernel.h>
18 #include <sys/systm.h>
19 #include <sys/errno.h>
20 #include <sys/sysproto.h>
21 #include <sys/malloc.h>
22 #include <sys/priv.h>
23 #include <sys/proc.h>
24 #include <sys/taskqueue.h>
25 #include <sys/jail.h>
26 #include <sys/lock.h>
27 #include <sys/mutex.h>
28 #include <sys/sx.h>
29 #include <sys/namei.h>
30 #include <sys/mount.h>
31 #include <sys/queue.h>
32 #include <sys/socket.h>
33 #include <sys/syscallsubr.h>
34 #include <sys/sysctl.h>
35 #include <sys/vnode.h>
36 #include <net/if.h>
37 #include <netinet/in.h>
38 
39 #include <security/mac/mac_framework.h>
40 
41 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
42 
43 SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
44     "Jail rules");
45 
46 int	jail_set_hostname_allowed = 1;
47 SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW,
48     &jail_set_hostname_allowed, 0,
49     "Processes in jail can set their hostnames");
50 
51 int	jail_socket_unixiproute_only = 1;
52 SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW,
53     &jail_socket_unixiproute_only, 0,
54     "Processes in jail are limited to creating UNIX/IPv4/route sockets only");
55 
56 int	jail_sysvipc_allowed = 0;
57 SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW,
58     &jail_sysvipc_allowed, 0,
59     "Processes in jail can use System V IPC primitives");
60 
61 static int jail_enforce_statfs = 2;
62 SYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW,
63     &jail_enforce_statfs, 0,
64     "Processes in jail cannot see all mounted file systems");
65 
66 int	jail_allow_raw_sockets = 0;
67 SYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW,
68     &jail_allow_raw_sockets, 0,
69     "Prison root can create raw sockets");
70 
71 int	jail_chflags_allowed = 0;
72 SYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW,
73     &jail_chflags_allowed, 0,
74     "Processes in jail can alter system file flags");
75 
76 int	jail_mount_allowed = 0;
77 SYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW,
78     &jail_mount_allowed, 0,
79     "Processes in jail can mount/unmount jail-friendly file systems");
80 
81 /* allprison, lastprid, and prisoncount are protected by allprison_lock. */
82 struct	prisonlist allprison;
83 struct	sx allprison_lock;
84 int	lastprid = 0;
85 int	prisoncount = 0;
86 
87 /*
88  * List of jail services. Protected by allprison_lock.
89  */
90 TAILQ_HEAD(prison_services_head, prison_service);
91 static struct prison_services_head prison_services =
92     TAILQ_HEAD_INITIALIZER(prison_services);
93 static int prison_service_slots = 0;
94 
95 struct prison_service {
96 	prison_create_t ps_create;
97 	prison_destroy_t ps_destroy;
98 	int		ps_slotno;
99 	TAILQ_ENTRY(prison_service) ps_next;
100 	char	ps_name[0];
101 };
102 
103 static void		 init_prison(void *);
104 static void		 prison_complete(void *context, int pending);
105 static int		 sysctl_jail_list(SYSCTL_HANDLER_ARGS);
106 
107 static void
108 init_prison(void *data __unused)
109 {
110 
111 	sx_init(&allprison_lock, "allprison");
112 	LIST_INIT(&allprison);
113 }
114 
115 SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
116 
117 /*
118  * struct jail_args {
119  *	struct jail *jail;
120  * };
121  */
122 int
123 jail(struct thread *td, struct jail_args *uap)
124 {
125 	struct nameidata nd;
126 	struct prison *pr, *tpr;
127 	struct prison_service *psrv;
128 	struct jail j;
129 	struct jail_attach_args jaa;
130 	int vfslocked, error, tryprid;
131 
132 	error = copyin(uap->jail, &j, sizeof(j));
133 	if (error)
134 		return (error);
135 	if (j.version != 0)
136 		return (EINVAL);
137 
138 	MALLOC(pr, struct prison *, sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
139 	mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF);
140 	pr->pr_ref = 1;
141 	error = copyinstr(j.path, &pr->pr_path, sizeof(pr->pr_path), 0);
142 	if (error)
143 		goto e_killmtx;
144 	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE,
145 	    pr->pr_path, td);
146 	error = namei(&nd);
147 	if (error)
148 		goto e_killmtx;
149 	vfslocked = NDHASGIANT(&nd);
150 	pr->pr_root = nd.ni_vp;
151 	VOP_UNLOCK(nd.ni_vp, 0, td);
152 	NDFREE(&nd, NDF_ONLY_PNBUF);
153 	VFS_UNLOCK_GIANT(vfslocked);
154 	error = copyinstr(j.hostname, &pr->pr_host, sizeof(pr->pr_host), 0);
155 	if (error)
156 		goto e_dropvnref;
157 	pr->pr_ip = j.ip_number;
158 	pr->pr_linux = NULL;
159 	pr->pr_securelevel = securelevel;
160 	if (prison_service_slots == 0)
161 		pr->pr_slots = NULL;
162 	else {
163 		pr->pr_slots = malloc(sizeof(*pr->pr_slots) * prison_service_slots,
164 		    M_PRISON, M_ZERO | M_WAITOK);
165 	}
166 
167 	/* Determine next pr_id and add prison to allprison list. */
168 	sx_xlock(&allprison_lock);
169 	tryprid = lastprid + 1;
170 	if (tryprid == JAIL_MAX)
171 		tryprid = 1;
172 next:
173 	LIST_FOREACH(tpr, &allprison, pr_list) {
174 		if (tpr->pr_id == tryprid) {
175 			tryprid++;
176 			if (tryprid == JAIL_MAX) {
177 				sx_xunlock(&allprison_lock);
178 				error = EAGAIN;
179 				goto e_dropvnref;
180 			}
181 			goto next;
182 		}
183 	}
184 	pr->pr_id = jaa.jid = lastprid = tryprid;
185 	LIST_INSERT_HEAD(&allprison, pr, pr_list);
186 	prisoncount++;
187 	sx_downgrade(&allprison_lock);
188 	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
189 		psrv->ps_create(psrv, pr);
190 	}
191 	sx_sunlock(&allprison_lock);
192 
193 	error = jail_attach(td, &jaa);
194 	if (error)
195 		goto e_dropprref;
196 	mtx_lock(&pr->pr_mtx);
197 	pr->pr_ref--;
198 	mtx_unlock(&pr->pr_mtx);
199 	td->td_retval[0] = jaa.jid;
200 	return (0);
201 e_dropprref:
202 	sx_xlock(&allprison_lock);
203 	LIST_REMOVE(pr, pr_list);
204 	prisoncount--;
205 	sx_downgrade(&allprison_lock);
206 	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
207 		psrv->ps_destroy(psrv, pr);
208 	}
209 	sx_sunlock(&allprison_lock);
210 e_dropvnref:
211 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
212 	vrele(pr->pr_root);
213 	VFS_UNLOCK_GIANT(vfslocked);
214 e_killmtx:
215 	mtx_destroy(&pr->pr_mtx);
216 	FREE(pr, M_PRISON);
217 	return (error);
218 }
219 
220 /*
221  * struct jail_attach_args {
222  *	int jid;
223  * };
224  */
225 int
226 jail_attach(struct thread *td, struct jail_attach_args *uap)
227 {
228 	struct proc *p;
229 	struct ucred *newcred, *oldcred;
230 	struct prison *pr;
231 	int vfslocked, error;
232 
233 	/*
234 	 * XXX: Note that there is a slight race here if two threads
235 	 * in the same privileged process attempt to attach to two
236 	 * different jails at the same time.  It is important for
237 	 * user processes not to do this, or they might end up with
238 	 * a process root from one prison, but attached to the jail
239 	 * of another.
240 	 */
241 	error = priv_check(td, PRIV_JAIL_ATTACH);
242 	if (error)
243 		return (error);
244 
245 	p = td->td_proc;
246 	sx_slock(&allprison_lock);
247 	pr = prison_find(uap->jid);
248 	if (pr == NULL) {
249 		sx_sunlock(&allprison_lock);
250 		return (EINVAL);
251 	}
252 	pr->pr_ref++;
253 	mtx_unlock(&pr->pr_mtx);
254 	sx_sunlock(&allprison_lock);
255 
256 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
257 	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY, td);
258 	if ((error = change_dir(pr->pr_root, td)) != 0)
259 		goto e_unlock;
260 #ifdef MAC
261 	if ((error = mac_check_vnode_chroot(td->td_ucred, pr->pr_root)))
262 		goto e_unlock;
263 #endif
264 	VOP_UNLOCK(pr->pr_root, 0, td);
265 	change_root(pr->pr_root, td);
266 	VFS_UNLOCK_GIANT(vfslocked);
267 
268 	newcred = crget();
269 	PROC_LOCK(p);
270 	oldcred = p->p_ucred;
271 	setsugid(p);
272 	crcopy(newcred, oldcred);
273 	newcred->cr_prison = pr;
274 	p->p_ucred = newcred;
275 	PROC_UNLOCK(p);
276 	crfree(oldcred);
277 	return (0);
278 e_unlock:
279 	VOP_UNLOCK(pr->pr_root, 0, td);
280 	VFS_UNLOCK_GIANT(vfslocked);
281 	mtx_lock(&pr->pr_mtx);
282 	pr->pr_ref--;
283 	mtx_unlock(&pr->pr_mtx);
284 	return (error);
285 }
286 
287 /*
288  * Returns a locked prison instance, or NULL on failure.
289  */
290 struct prison *
291 prison_find(int prid)
292 {
293 	struct prison *pr;
294 
295 	sx_assert(&allprison_lock, SX_LOCKED);
296 	LIST_FOREACH(pr, &allprison, pr_list) {
297 		if (pr->pr_id == prid) {
298 			mtx_lock(&pr->pr_mtx);
299 			return (pr);
300 		}
301 	}
302 	return (NULL);
303 }
304 
305 void
306 prison_free(struct prison *pr)
307 {
308 	struct prison_service *psrv;
309 
310 	sx_xlock(&allprison_lock);
311 	mtx_lock(&pr->pr_mtx);
312 	pr->pr_ref--;
313 	if (pr->pr_ref == 0) {
314 		LIST_REMOVE(pr, pr_list);
315 		mtx_unlock(&pr->pr_mtx);
316 		prisoncount--;
317 		sx_downgrade(&allprison_lock);
318 		TAILQ_FOREACH(psrv, &prison_services, ps_next) {
319 			psrv->ps_destroy(psrv, pr);
320 		}
321 		sx_sunlock(&allprison_lock);
322 
323 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
324 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
325 		return;
326 	}
327 	mtx_unlock(&pr->pr_mtx);
328 	sx_xunlock(&allprison_lock);
329 }
330 
331 static void
332 prison_complete(void *context, int pending)
333 {
334 	struct prison *pr;
335 	int vfslocked;
336 
337 	pr = (struct prison *)context;
338 
339 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
340 	vrele(pr->pr_root);
341 	VFS_UNLOCK_GIANT(vfslocked);
342 
343 	mtx_destroy(&pr->pr_mtx);
344 	if (pr->pr_linux != NULL)
345 		FREE(pr->pr_linux, M_PRISON);
346 	FREE(pr, M_PRISON);
347 }
348 
349 void
350 prison_hold(struct prison *pr)
351 {
352 
353 	mtx_lock(&pr->pr_mtx);
354 	pr->pr_ref++;
355 	mtx_unlock(&pr->pr_mtx);
356 }
357 
358 u_int32_t
359 prison_getip(struct ucred *cred)
360 {
361 
362 	return (cred->cr_prison->pr_ip);
363 }
364 
365 int
366 prison_ip(struct ucred *cred, int flag, u_int32_t *ip)
367 {
368 	u_int32_t tmp;
369 
370 	if (!jailed(cred))
371 		return (0);
372 	if (flag)
373 		tmp = *ip;
374 	else
375 		tmp = ntohl(*ip);
376 	if (tmp == INADDR_ANY) {
377 		if (flag)
378 			*ip = cred->cr_prison->pr_ip;
379 		else
380 			*ip = htonl(cred->cr_prison->pr_ip);
381 		return (0);
382 	}
383 	if (tmp == INADDR_LOOPBACK) {
384 		if (flag)
385 			*ip = cred->cr_prison->pr_ip;
386 		else
387 			*ip = htonl(cred->cr_prison->pr_ip);
388 		return (0);
389 	}
390 	if (cred->cr_prison->pr_ip != tmp)
391 		return (1);
392 	return (0);
393 }
394 
395 void
396 prison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip)
397 {
398 	u_int32_t tmp;
399 
400 	if (!jailed(cred))
401 		return;
402 	if (flag)
403 		tmp = *ip;
404 	else
405 		tmp = ntohl(*ip);
406 	if (tmp == INADDR_LOOPBACK) {
407 		if (flag)
408 			*ip = cred->cr_prison->pr_ip;
409 		else
410 			*ip = htonl(cred->cr_prison->pr_ip);
411 		return;
412 	}
413 	return;
414 }
415 
416 int
417 prison_if(struct ucred *cred, struct sockaddr *sa)
418 {
419 	struct sockaddr_in *sai;
420 	int ok;
421 
422 	sai = (struct sockaddr_in *)sa;
423 	if ((sai->sin_family != AF_INET) && jail_socket_unixiproute_only)
424 		ok = 1;
425 	else if (sai->sin_family != AF_INET)
426 		ok = 0;
427 	else if (cred->cr_prison->pr_ip != ntohl(sai->sin_addr.s_addr))
428 		ok = 1;
429 	else
430 		ok = 0;
431 	return (ok);
432 }
433 
434 /*
435  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
436  */
437 int
438 prison_check(struct ucred *cred1, struct ucred *cred2)
439 {
440 
441 	if (jailed(cred1)) {
442 		if (!jailed(cred2))
443 			return (ESRCH);
444 		if (cred2->cr_prison != cred1->cr_prison)
445 			return (ESRCH);
446 	}
447 
448 	return (0);
449 }
450 
451 /*
452  * Return 1 if the passed credential is in a jail, otherwise 0.
453  */
454 int
455 jailed(struct ucred *cred)
456 {
457 
458 	return (cred->cr_prison != NULL);
459 }
460 
461 /*
462  * Return the correct hostname for the passed credential.
463  */
464 void
465 getcredhostname(struct ucred *cred, char *buf, size_t size)
466 {
467 
468 	if (jailed(cred)) {
469 		mtx_lock(&cred->cr_prison->pr_mtx);
470 		strlcpy(buf, cred->cr_prison->pr_host, size);
471 		mtx_unlock(&cred->cr_prison->pr_mtx);
472 	} else
473 		strlcpy(buf, hostname, size);
474 }
475 
476 /*
477  * Determine whether the subject represented by cred can "see"
478  * status of a mount point.
479  * Returns: 0 for permitted, ENOENT otherwise.
480  * XXX: This function should be called cr_canseemount() and should be
481  *      placed in kern_prot.c.
482  */
483 int
484 prison_canseemount(struct ucred *cred, struct mount *mp)
485 {
486 	struct prison *pr;
487 	struct statfs *sp;
488 	size_t len;
489 
490 	if (!jailed(cred) || jail_enforce_statfs == 0)
491 		return (0);
492 	pr = cred->cr_prison;
493 	if (pr->pr_root->v_mount == mp)
494 		return (0);
495 	if (jail_enforce_statfs == 2)
496 		return (ENOENT);
497 	/*
498 	 * If jail's chroot directory is set to "/" we should be able to see
499 	 * all mount-points from inside a jail.
500 	 * This is ugly check, but this is the only situation when jail's
501 	 * directory ends with '/'.
502 	 */
503 	if (strcmp(pr->pr_path, "/") == 0)
504 		return (0);
505 	len = strlen(pr->pr_path);
506 	sp = &mp->mnt_stat;
507 	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
508 		return (ENOENT);
509 	/*
510 	 * Be sure that we don't have situation where jail's root directory
511 	 * is "/some/path" and mount point is "/some/pathpath".
512 	 */
513 	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
514 		return (ENOENT);
515 	return (0);
516 }
517 
518 void
519 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
520 {
521 	char jpath[MAXPATHLEN];
522 	struct prison *pr;
523 	size_t len;
524 
525 	if (!jailed(cred) || jail_enforce_statfs == 0)
526 		return;
527 	pr = cred->cr_prison;
528 	if (prison_canseemount(cred, mp) != 0) {
529 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
530 		strlcpy(sp->f_mntonname, "[restricted]",
531 		    sizeof(sp->f_mntonname));
532 		return;
533 	}
534 	if (pr->pr_root->v_mount == mp) {
535 		/*
536 		 * Clear current buffer data, so we are sure nothing from
537 		 * the valid path left there.
538 		 */
539 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
540 		*sp->f_mntonname = '/';
541 		return;
542 	}
543 	/*
544 	 * If jail's chroot directory is set to "/" we should be able to see
545 	 * all mount-points from inside a jail.
546 	 */
547 	if (strcmp(pr->pr_path, "/") == 0)
548 		return;
549 	len = strlen(pr->pr_path);
550 	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
551 	/*
552 	 * Clear current buffer data, so we are sure nothing from
553 	 * the valid path left there.
554 	 */
555 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
556 	if (*jpath == '\0') {
557 		/* Should never happen. */
558 		*sp->f_mntonname = '/';
559 	} else {
560 		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
561 	}
562 }
563 
564 /*
565  * Check with permission for a specific privilege is granted within jail.  We
566  * have a specific list of accepted privileges; the rest are denied.
567  */
568 int
569 prison_priv_check(struct ucred *cred, int priv)
570 {
571 
572 	if (!jailed(cred))
573 		return (0);
574 
575 	switch (priv) {
576 
577 		/*
578 		 * Allow ktrace privileges for root in jail.
579 		 */
580 	case PRIV_KTRACE:
581 
582 #if 0
583 		/*
584 		 * Allow jailed processes to configure audit identity and
585 		 * submit audit records (login, etc).  In the future we may
586 		 * want to further refine the relationship between audit and
587 		 * jail.
588 		 */
589 	case PRIV_AUDIT_GETAUDIT:
590 	case PRIV_AUDIT_SETAUDIT:
591 	case PRIV_AUDIT_SUBMIT:
592 #endif
593 
594 		/*
595 		 * Allow jailed processes to manipulate process UNIX
596 		 * credentials in any way they see fit.
597 		 */
598 	case PRIV_CRED_SETUID:
599 	case PRIV_CRED_SETEUID:
600 	case PRIV_CRED_SETGID:
601 	case PRIV_CRED_SETEGID:
602 	case PRIV_CRED_SETGROUPS:
603 	case PRIV_CRED_SETREUID:
604 	case PRIV_CRED_SETREGID:
605 	case PRIV_CRED_SETRESUID:
606 	case PRIV_CRED_SETRESGID:
607 
608 		/*
609 		 * Jail implements visibility constraints already, so allow
610 		 * jailed root to override uid/gid-based constraints.
611 		 */
612 	case PRIV_SEEOTHERGIDS:
613 	case PRIV_SEEOTHERUIDS:
614 
615 		/*
616 		 * Jail implements inter-process debugging limits already, so
617 		 * allow jailed root various debugging privileges.
618 		 */
619 	case PRIV_DEBUG_DIFFCRED:
620 	case PRIV_DEBUG_SUGID:
621 	case PRIV_DEBUG_UNPRIV:
622 
623 		/*
624 		 * Allow jail to set various resource limits and login
625 		 * properties, and for now, exceed process resource limits.
626 		 */
627 	case PRIV_PROC_LIMIT:
628 	case PRIV_PROC_SETLOGIN:
629 	case PRIV_PROC_SETRLIMIT:
630 
631 		/*
632 		 * System V and POSIX IPC privileges are granted in jail.
633 		 */
634 	case PRIV_IPC_READ:
635 	case PRIV_IPC_WRITE:
636 	case PRIV_IPC_ADMIN:
637 	case PRIV_IPC_MSGSIZE:
638 	case PRIV_MQ_ADMIN:
639 
640 		/*
641 		 * Jail implements its own inter-process limits, so allow
642 		 * root processes in jail to change scheduling on other
643 		 * processes in the same jail.  Likewise for signalling.
644 		 */
645 	case PRIV_SCHED_DIFFCRED:
646 	case PRIV_SIGNAL_DIFFCRED:
647 	case PRIV_SIGNAL_SUGID:
648 
649 		/*
650 		 * Allow jailed processes to write to sysctls marked as jail
651 		 * writable.
652 		 */
653 	case PRIV_SYSCTL_WRITEJAIL:
654 
655 		/*
656 		 * Allow root in jail to manage a variety of quota
657 		 * properties.  These should likely be conditional on a
658 		 * configuration option.
659 		 */
660 	case PRIV_VFS_GETQUOTA:
661 	case PRIV_VFS_SETQUOTA:
662 
663 		/*
664 		 * Since Jail relies on chroot() to implement file system
665 		 * protections, grant many VFS privileges to root in jail.
666 		 * Be careful to exclude mount-related and NFS-related
667 		 * privileges.
668 		 */
669 	case PRIV_VFS_READ:
670 	case PRIV_VFS_WRITE:
671 	case PRIV_VFS_ADMIN:
672 	case PRIV_VFS_EXEC:
673 	case PRIV_VFS_LOOKUP:
674 	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
675 	case PRIV_VFS_CHFLAGS_DEV:
676 	case PRIV_VFS_CHOWN:
677 	case PRIV_VFS_CHROOT:
678 	case PRIV_VFS_RETAINSUGID:
679 	case PRIV_VFS_FCHROOT:
680 	case PRIV_VFS_LINK:
681 	case PRIV_VFS_SETGID:
682 	case PRIV_VFS_STICKYFILE:
683 		return (0);
684 
685 		/*
686 		 * Depending on the global setting, allow privilege of
687 		 * setting system flags.
688 		 */
689 	case PRIV_VFS_SYSFLAGS:
690 		if (jail_chflags_allowed)
691 			return (0);
692 		else
693 			return (EPERM);
694 
695 		/*
696 		 * Depending on the global setting, allow privilege of
697 		 * mounting/unmounting file systems.
698 		 */
699 	case PRIV_VFS_MOUNT:
700 	case PRIV_VFS_UNMOUNT:
701 	case PRIV_VFS_MOUNT_NONUSER:
702 		if (jail_mount_allowed)
703 			return (0);
704 		else
705 			return (EPERM);
706 
707 		/*
708 		 * Allow jailed root to bind reserved ports.
709 		 */
710 	case PRIV_NETINET_RESERVEDPORT:
711 		return (0);
712 
713 		/*
714 		 * Conditionally allow creating raw sockets in jail.
715 		 */
716 	case PRIV_NETINET_RAW:
717 		if (jail_allow_raw_sockets)
718 			return (0);
719 		else
720 			return (EPERM);
721 
722 		/*
723 		 * Since jail implements its own visibility limits on netstat
724 		 * sysctls, allow getcred.  This allows identd to work in
725 		 * jail.
726 		 */
727 	case PRIV_NETINET_GETCRED:
728 		return (0);
729 
730 	default:
731 		/*
732 		 * In all remaining cases, deny the privilege request.  This
733 		 * includes almost all network privileges, many system
734 		 * configuration privileges.
735 		 */
736 		return (EPERM);
737 	}
738 }
739 
740 /*
741  * Register jail service. Provides 'create' and 'destroy' methods.
742  * 'create' method will be called for every existing jail and all
743  * jails in the future as they beeing created.
744  * 'destroy' method will be called for every jail going away and
745  * for all existing jails at the time of service deregistration.
746  */
747 struct prison_service *
748 prison_service_register(const char *name, prison_create_t create,
749     prison_destroy_t destroy)
750 {
751 	struct prison_service *psrv, *psrv2;
752 	struct prison *pr;
753 	int reallocate = 1, slotno = 0;
754 	void **slots, **oldslots;
755 
756 	psrv = malloc(sizeof(*psrv) + strlen(name) + 1, M_PRISON,
757 	    M_WAITOK | M_ZERO);
758 	psrv->ps_create = create;
759 	psrv->ps_destroy = destroy;
760 	strcpy(psrv->ps_name, name);
761 	/*
762 	 * Grab the allprison_lock here, so we won't miss any jail
763 	 * creation/destruction.
764 	 */
765 	sx_xlock(&allprison_lock);
766 #ifdef INVARIANTS
767 	/*
768 	 * Verify if service is not already registered.
769 	 */
770 	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
771 		KASSERT(strcmp(psrv2->ps_name, name) != 0,
772 		    ("jail service %s already registered", name));
773 	}
774 #endif
775 	/*
776 	 * Find free slot. When there is no existing free slot available,
777 	 * allocate one at the end.
778 	 */
779 	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
780 		if (psrv2->ps_slotno != slotno) {
781 			KASSERT(slotno < psrv2->ps_slotno,
782 			    ("Invalid slotno (slotno=%d >= ps_slotno=%d",
783 			    slotno, psrv2->ps_slotno));
784 			/* We found free slot. */
785 			reallocate = 0;
786 			break;
787 		}
788 		slotno++;
789 	}
790 	psrv->ps_slotno = slotno;
791 	/*
792 	 * Keep the list sorted by slot number.
793 	 */
794 	if (psrv2 != NULL) {
795 		KASSERT(reallocate == 0, ("psrv2 != NULL && reallocate != 0"));
796 		TAILQ_INSERT_BEFORE(psrv2, psrv, ps_next);
797 	} else {
798 		KASSERT(reallocate == 1, ("psrv2 == NULL && reallocate == 0"));
799 		TAILQ_INSERT_TAIL(&prison_services, psrv, ps_next);
800 	}
801 	prison_service_slots++;
802 	sx_downgrade(&allprison_lock);
803 	/*
804 	 * Allocate memory for new slot if we didn't found empty one.
805 	 * Do not use realloc(9), because pr_slots is protected with a mutex,
806 	 * so we can't sleep.
807 	 */
808 	LIST_FOREACH(pr, &allprison, pr_list) {
809 		if (reallocate) {
810 			/* First allocate memory with M_WAITOK. */
811 			slots = malloc(sizeof(*slots) * prison_service_slots,
812 			    M_PRISON, M_WAITOK);
813 			/* Now grab the mutex and replace pr_slots. */
814 			mtx_lock(&pr->pr_mtx);
815 			oldslots = pr->pr_slots;
816 			if (psrv->ps_slotno > 0) {
817 				bcopy(oldslots, slots,
818 				    sizeof(*slots) * (prison_service_slots - 1));
819 			}
820 			slots[psrv->ps_slotno] = NULL;
821 			pr->pr_slots = slots;
822 			mtx_unlock(&pr->pr_mtx);
823 			if (oldslots != NULL)
824 				free(oldslots, M_PRISON);
825 		}
826 		/*
827 		 * Call 'create' method for each existing jail.
828 		 */
829 		psrv->ps_create(psrv, pr);
830 	}
831 	sx_sunlock(&allprison_lock);
832 
833 	return (psrv);
834 }
835 
836 void
837 prison_service_deregister(struct prison_service *psrv)
838 {
839 	struct prison *pr;
840 	void **slots, **oldslots;
841 	int last = 0;
842 
843 	sx_xlock(&allprison_lock);
844 	if (TAILQ_LAST(&prison_services, prison_services_head) == psrv)
845 		last = 1;
846 	TAILQ_REMOVE(&prison_services, psrv, ps_next);
847 	prison_service_slots--;
848 	sx_downgrade(&allprison_lock);
849 	LIST_FOREACH(pr, &allprison, pr_list) {
850 		/*
851 		 * Call 'destroy' method for every currently existing jail.
852 		 */
853 		psrv->ps_destroy(psrv, pr);
854 		/*
855 		 * If this is the last slot, free the memory allocated for it.
856 		 */
857 		if (last) {
858 			if (prison_service_slots == 0)
859 				slots = NULL;
860 			else {
861 				slots = malloc(sizeof(*slots) * prison_service_slots,
862 				    M_PRISON, M_WAITOK);
863 			}
864 			mtx_lock(&pr->pr_mtx);
865 			oldslots = pr->pr_slots;
866 			/*
867 			 * We require setting slot to NULL after freeing it,
868 			 * this way we can check for memory leaks here.
869 			 */
870 			KASSERT(oldslots[psrv->ps_slotno] == NULL,
871 			    ("Slot %d (service %s, jailid=%d) still contains data?",
872 			     psrv->ps_slotno, psrv->ps_name, pr->pr_id));
873 			if (psrv->ps_slotno > 0) {
874 				bcopy(oldslots, slots,
875 				    sizeof(*slots) * prison_service_slots);
876 			}
877 			pr->pr_slots = slots;
878 			mtx_unlock(&pr->pr_mtx);
879 			KASSERT(oldslots != NULL, ("oldslots == NULL"));
880 			free(oldslots, M_PRISON);
881 		}
882 	}
883 	sx_sunlock(&allprison_lock);
884 	free(psrv, M_PRISON);
885 }
886 
887 /*
888  * Function sets data for the given jail in slot assigned for the given
889  * jail service.
890  */
891 void
892 prison_service_data_set(struct prison_service *psrv, struct prison *pr,
893     void *data)
894 {
895 
896 	mtx_assert(&pr->pr_mtx, MA_OWNED);
897 	pr->pr_slots[psrv->ps_slotno] = data;
898 }
899 
900 /*
901  * Function clears slots assigned for the given jail service in the given
902  * prison structure and returns current slot data.
903  */
904 void *
905 prison_service_data_del(struct prison_service *psrv, struct prison *pr)
906 {
907 	void *data;
908 
909 	mtx_assert(&pr->pr_mtx, MA_OWNED);
910 	data = pr->pr_slots[psrv->ps_slotno];
911 	pr->pr_slots[psrv->ps_slotno] = NULL;
912 	return (data);
913 }
914 
915 /*
916  * Function returns current data from the slot assigned to the given jail
917  * service for the given jail.
918  */
919 void *
920 prison_service_data_get(struct prison_service *psrv, struct prison *pr)
921 {
922 
923 	mtx_assert(&pr->pr_mtx, MA_OWNED);
924 	return (pr->pr_slots[psrv->ps_slotno]);
925 }
926 
927 static int
928 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
929 {
930 	struct xprison *xp, *sxp;
931 	struct prison *pr;
932 	int count, error;
933 
934 	if (jailed(req->td->td_ucred))
935 		return (0);
936 
937 	sx_slock(&allprison_lock);
938 	if ((count = prisoncount) == 0) {
939 		sx_sunlock(&allprison_lock);
940 		return (0);
941 	}
942 
943 	sxp = xp = malloc(sizeof(*xp) * count, M_TEMP, M_WAITOK | M_ZERO);
944 
945 	LIST_FOREACH(pr, &allprison, pr_list) {
946 		mtx_lock(&pr->pr_mtx);
947 		xp->pr_version = XPRISON_VERSION;
948 		xp->pr_id = pr->pr_id;
949 		strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
950 		strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
951 		xp->pr_ip = pr->pr_ip;
952 		mtx_unlock(&pr->pr_mtx);
953 		xp++;
954 	}
955 	sx_sunlock(&allprison_lock);
956 
957 	error = SYSCTL_OUT(req, sxp, sizeof(*sxp) * count);
958 	free(sxp, M_TEMP);
959 	return (error);
960 }
961 
962 SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD,
963     NULL, 0, sysctl_jail_list, "S", "List of active jails");
964 
965 static int
966 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
967 {
968 	int error, injail;
969 
970 	injail = jailed(req->td->td_ucred);
971 	error = SYSCTL_OUT(req, &injail, sizeof(injail));
972 
973 	return (error);
974 }
975 SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD,
976     NULL, 0, sysctl_jail_jailed, "I", "Process in jail?");
977