xref: /freebsd/sys/kern/kern_jail.c (revision d876124d6ae9d56da5b4ff4c6015efd1d0c9222a)
1 /*-
2  * ----------------------------------------------------------------------------
3  * "THE BEER-WARE LICENSE" (Revision 42):
4  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
5  * can do whatever you want with this stuff. If we meet some day, and you think
6  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
7  * ----------------------------------------------------------------------------
8  */
9 
10 #include <sys/cdefs.h>
11 __FBSDID("$FreeBSD$");
12 
13 #include "opt_mac.h"
14 
15 #include <sys/param.h>
16 #include <sys/types.h>
17 #include <sys/kernel.h>
18 #include <sys/systm.h>
19 #include <sys/errno.h>
20 #include <sys/sysproto.h>
21 #include <sys/malloc.h>
22 #include <sys/priv.h>
23 #include <sys/proc.h>
24 #include <sys/taskqueue.h>
25 #include <sys/fcntl.h>
26 #include <sys/jail.h>
27 #include <sys/limits.h>
28 #include <sys/lock.h>
29 #include <sys/mutex.h>
30 #include <sys/sx.h>
31 #include <sys/namei.h>
32 #include <sys/mount.h>
33 #include <sys/queue.h>
34 #include <sys/socket.h>
35 #include <sys/syscallsubr.h>
36 #include <sys/sysctl.h>
37 #include <sys/vnode.h>
38 #include <net/if.h>
39 #include <netinet/in.h>
40 
41 #include <security/mac/mac_framework.h>
42 
43 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
44 
45 SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
46     "Jail rules");
47 
48 int	jail_set_hostname_allowed = 1;
49 SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW,
50     &jail_set_hostname_allowed, 0,
51     "Processes in jail can set their hostnames");
52 
53 int	jail_socket_unixiproute_only = 1;
54 SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW,
55     &jail_socket_unixiproute_only, 0,
56     "Processes in jail are limited to creating UNIX/IPv4/route sockets only");
57 
58 int	jail_sysvipc_allowed = 0;
59 SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW,
60     &jail_sysvipc_allowed, 0,
61     "Processes in jail can use System V IPC primitives");
62 
63 static int jail_enforce_statfs = 2;
64 SYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW,
65     &jail_enforce_statfs, 0,
66     "Processes in jail cannot see all mounted file systems");
67 
68 int	jail_allow_raw_sockets = 0;
69 SYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW,
70     &jail_allow_raw_sockets, 0,
71     "Prison root can create raw sockets");
72 
73 int	jail_chflags_allowed = 0;
74 SYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW,
75     &jail_chflags_allowed, 0,
76     "Processes in jail can alter system file flags");
77 
78 int	jail_mount_allowed = 0;
79 SYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW,
80     &jail_mount_allowed, 0,
81     "Processes in jail can mount/unmount jail-friendly file systems");
82 
83 /* allprison and prisoncount are protected by allprison_lock. */
84 struct	sx allprison_lock;
85 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
86 struct	prisonlist allprison = LIST_HEAD_INITIALIZER(allprison);
87 int	prisoncount = 0;
88 /* Prison number allocation */
89 static struct unrhdr *prison_numpool;
90 
91 /*
92  * List of jail services. Protected by allprison_lock.
93  */
94 TAILQ_HEAD(prison_services_head, prison_service);
95 static struct prison_services_head prison_services =
96     TAILQ_HEAD_INITIALIZER(prison_services);
97 static int prison_service_slots = 0;
98 
99 struct prison_service {
100 	prison_create_t ps_create;
101 	prison_destroy_t ps_destroy;
102 	int		ps_slotno;
103 	TAILQ_ENTRY(prison_service) ps_next;
104 	char	ps_name[0];
105 };
106 
107 static void		 init_prison(void *);
108 static void		 prison_complete(void *context, int pending);
109 static int		 sysctl_jail_list(SYSCTL_HANDLER_ARGS);
110 
111 static void
112 init_prison(void *data __unused)
113 {
114 
115 	prison_numpool = new_unrhdr(1, INT_MAX, NULL);
116 }
117 
118 SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
119 
120 /*
121  * struct jail_args {
122  *	struct jail *jail;
123  * };
124  */
125 int
126 jail(struct thread *td, struct jail_args *uap)
127 {
128 	struct nameidata nd;
129 	struct prison *pr;
130 	struct prison_service *psrv;
131 	struct jail j;
132 	struct jail_attach_args jaa;
133 	int vfslocked, error, prid;
134 
135 	error = copyin(uap->jail, &j, sizeof(j));
136 	if (error)
137 		return (error);
138 	if (j.version != 0)
139 		return (EINVAL);
140 
141 	/* Allocate prison number */
142 	prid = alloc_unr(prison_numpool);
143 	if (prid == -1)
144 		return (EAGAIN);
145 
146 	MALLOC(pr, struct prison *, sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
147 	mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF);
148 	pr->pr_ref = 1;
149 	pr->pr_id = jaa.jid = prid;
150 	error = copyinstr(j.path, &pr->pr_path, sizeof(pr->pr_path), 0);
151 	if (error)
152 		goto e_killmtx;
153 	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE,
154 	    pr->pr_path, td);
155 	error = namei(&nd);
156 	if (error)
157 		goto e_killmtx;
158 	vfslocked = NDHASGIANT(&nd);
159 	pr->pr_root = nd.ni_vp;
160 	VOP_UNLOCK(nd.ni_vp, 0);
161 	NDFREE(&nd, NDF_ONLY_PNBUF);
162 	VFS_UNLOCK_GIANT(vfslocked);
163 	error = copyinstr(j.hostname, &pr->pr_host, sizeof(pr->pr_host), 0);
164 	if (error)
165 		goto e_dropvnref;
166 	pr->pr_ip = j.ip_number;
167 	pr->pr_linux = NULL;
168 	pr->pr_securelevel = securelevel;
169 	if (prison_service_slots == 0)
170 		pr->pr_slots = NULL;
171 	else {
172 		pr->pr_slots = malloc(sizeof(*pr->pr_slots) * prison_service_slots,
173 		    M_PRISON, M_ZERO | M_WAITOK);
174 	}
175 
176 	/* Add prison to allprison list. */
177 	sx_xlock(&allprison_lock);
178 	LIST_INSERT_HEAD(&allprison, pr, pr_list);
179 	prisoncount++;
180 	sx_downgrade(&allprison_lock);
181 	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
182 		psrv->ps_create(psrv, pr);
183 	}
184 	sx_sunlock(&allprison_lock);
185 
186 	error = jail_attach(td, &jaa);
187 	if (error)
188 		goto e_dropprref;
189 	mtx_lock(&pr->pr_mtx);
190 	pr->pr_ref--;
191 	mtx_unlock(&pr->pr_mtx);
192 	td->td_retval[0] = jaa.jid;
193 	return (0);
194 e_dropprref:
195 	sx_xlock(&allprison_lock);
196 	LIST_REMOVE(pr, pr_list);
197 	prisoncount--;
198 	sx_downgrade(&allprison_lock);
199 	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
200 		psrv->ps_destroy(psrv, pr);
201 	}
202 	sx_sunlock(&allprison_lock);
203 e_dropvnref:
204 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
205 	vrele(pr->pr_root);
206 	VFS_UNLOCK_GIANT(vfslocked);
207 e_killmtx:
208 	mtx_destroy(&pr->pr_mtx);
209 	free_unr(prison_numpool, pr->pr_id);
210 	FREE(pr, M_PRISON);
211 	return (error);
212 }
213 
214 /*
215  * struct jail_attach_args {
216  *	int jid;
217  * };
218  */
219 int
220 jail_attach(struct thread *td, struct jail_attach_args *uap)
221 {
222 	struct proc *p;
223 	struct ucred *newcred, *oldcred;
224 	struct prison *pr;
225 	int vfslocked, error;
226 
227 	/*
228 	 * XXX: Note that there is a slight race here if two threads
229 	 * in the same privileged process attempt to attach to two
230 	 * different jails at the same time.  It is important for
231 	 * user processes not to do this, or they might end up with
232 	 * a process root from one prison, but attached to the jail
233 	 * of another.
234 	 */
235 	error = priv_check(td, PRIV_JAIL_ATTACH);
236 	if (error)
237 		return (error);
238 
239 	p = td->td_proc;
240 	sx_slock(&allprison_lock);
241 	pr = prison_find(uap->jid);
242 	if (pr == NULL) {
243 		sx_sunlock(&allprison_lock);
244 		return (EINVAL);
245 	}
246 	pr->pr_ref++;
247 	mtx_unlock(&pr->pr_mtx);
248 	sx_sunlock(&allprison_lock);
249 
250 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
251 	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
252 	if ((error = change_dir(pr->pr_root, td)) != 0)
253 		goto e_unlock;
254 #ifdef MAC
255 	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
256 		goto e_unlock;
257 #endif
258 	VOP_UNLOCK(pr->pr_root, 0);
259 	change_root(pr->pr_root, td);
260 	VFS_UNLOCK_GIANT(vfslocked);
261 
262 	newcred = crget();
263 	PROC_LOCK(p);
264 	oldcred = p->p_ucred;
265 	setsugid(p);
266 	crcopy(newcred, oldcred);
267 	newcred->cr_prison = pr;
268 	p->p_ucred = newcred;
269 	PROC_UNLOCK(p);
270 	crfree(oldcred);
271 	return (0);
272 e_unlock:
273 	VOP_UNLOCK(pr->pr_root, 0);
274 	VFS_UNLOCK_GIANT(vfslocked);
275 	mtx_lock(&pr->pr_mtx);
276 	pr->pr_ref--;
277 	mtx_unlock(&pr->pr_mtx);
278 	return (error);
279 }
280 
281 /*
282  * Returns a locked prison instance, or NULL on failure.
283  */
284 struct prison *
285 prison_find(int prid)
286 {
287 	struct prison *pr;
288 
289 	sx_assert(&allprison_lock, SX_LOCKED);
290 	LIST_FOREACH(pr, &allprison, pr_list) {
291 		if (pr->pr_id == prid) {
292 			mtx_lock(&pr->pr_mtx);
293 			if (pr->pr_ref == 0) {
294 				mtx_unlock(&pr->pr_mtx);
295 				break;
296 			}
297 			return (pr);
298 		}
299 	}
300 	return (NULL);
301 }
302 
303 void
304 prison_free(struct prison *pr)
305 {
306 
307 	mtx_lock(&pr->pr_mtx);
308 	pr->pr_ref--;
309 	if (pr->pr_ref == 0) {
310 		mtx_unlock(&pr->pr_mtx);
311 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
312 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
313 		return;
314 	}
315 	mtx_unlock(&pr->pr_mtx);
316 }
317 
318 static void
319 prison_complete(void *context, int pending)
320 {
321 	struct prison_service *psrv;
322 	struct prison *pr;
323 	int vfslocked;
324 
325 	pr = (struct prison *)context;
326 
327 	sx_xlock(&allprison_lock);
328 	LIST_REMOVE(pr, pr_list);
329 	prisoncount--;
330 	sx_downgrade(&allprison_lock);
331 	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
332 		psrv->ps_destroy(psrv, pr);
333 	}
334 	sx_sunlock(&allprison_lock);
335 
336 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
337 	vrele(pr->pr_root);
338 	VFS_UNLOCK_GIANT(vfslocked);
339 
340 	mtx_destroy(&pr->pr_mtx);
341 	if (pr->pr_linux != NULL)
342 		FREE(pr->pr_linux, M_PRISON);
343 	free_unr(prison_numpool, pr->pr_id);
344 	FREE(pr, M_PRISON);
345 }
346 
347 void
348 prison_hold(struct prison *pr)
349 {
350 
351 	mtx_lock(&pr->pr_mtx);
352 	KASSERT(pr->pr_ref > 0,
353 	    ("Trying to hold dead prison (id=%d).", pr->pr_id));
354 	pr->pr_ref++;
355 	mtx_unlock(&pr->pr_mtx);
356 }
357 
358 u_int32_t
359 prison_getip(struct ucred *cred)
360 {
361 
362 	return (cred->cr_prison->pr_ip);
363 }
364 
365 int
366 prison_ip(struct ucred *cred, int flag, u_int32_t *ip)
367 {
368 	u_int32_t tmp;
369 
370 	if (!jailed(cred))
371 		return (0);
372 	if (flag)
373 		tmp = *ip;
374 	else
375 		tmp = ntohl(*ip);
376 	if (tmp == INADDR_ANY) {
377 		if (flag)
378 			*ip = cred->cr_prison->pr_ip;
379 		else
380 			*ip = htonl(cred->cr_prison->pr_ip);
381 		return (0);
382 	}
383 	if (tmp == INADDR_LOOPBACK) {
384 		if (flag)
385 			*ip = cred->cr_prison->pr_ip;
386 		else
387 			*ip = htonl(cred->cr_prison->pr_ip);
388 		return (0);
389 	}
390 	if (cred->cr_prison->pr_ip != tmp)
391 		return (1);
392 	return (0);
393 }
394 
395 void
396 prison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip)
397 {
398 	u_int32_t tmp;
399 
400 	if (!jailed(cred))
401 		return;
402 	if (flag)
403 		tmp = *ip;
404 	else
405 		tmp = ntohl(*ip);
406 	if (tmp == INADDR_LOOPBACK) {
407 		if (flag)
408 			*ip = cred->cr_prison->pr_ip;
409 		else
410 			*ip = htonl(cred->cr_prison->pr_ip);
411 		return;
412 	}
413 	return;
414 }
415 
416 int
417 prison_if(struct ucred *cred, struct sockaddr *sa)
418 {
419 	struct sockaddr_in *sai;
420 	int ok;
421 
422 	sai = (struct sockaddr_in *)sa;
423 	if ((sai->sin_family != AF_INET) && jail_socket_unixiproute_only)
424 		ok = 1;
425 	else if (sai->sin_family != AF_INET)
426 		ok = 0;
427 	else if (cred->cr_prison->pr_ip != ntohl(sai->sin_addr.s_addr))
428 		ok = 1;
429 	else
430 		ok = 0;
431 	return (ok);
432 }
433 
434 /*
435  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
436  */
437 int
438 prison_check(struct ucred *cred1, struct ucred *cred2)
439 {
440 
441 	if (jailed(cred1)) {
442 		if (!jailed(cred2))
443 			return (ESRCH);
444 		if (cred2->cr_prison != cred1->cr_prison)
445 			return (ESRCH);
446 	}
447 
448 	return (0);
449 }
450 
451 /*
452  * Return 1 if the passed credential is in a jail, otherwise 0.
453  */
454 int
455 jailed(struct ucred *cred)
456 {
457 
458 	return (cred->cr_prison != NULL);
459 }
460 
461 /*
462  * Return the correct hostname for the passed credential.
463  */
464 void
465 getcredhostname(struct ucred *cred, char *buf, size_t size)
466 {
467 
468 	if (jailed(cred)) {
469 		mtx_lock(&cred->cr_prison->pr_mtx);
470 		strlcpy(buf, cred->cr_prison->pr_host, size);
471 		mtx_unlock(&cred->cr_prison->pr_mtx);
472 	} else
473 		strlcpy(buf, hostname, size);
474 }
475 
476 /*
477  * Determine whether the subject represented by cred can "see"
478  * status of a mount point.
479  * Returns: 0 for permitted, ENOENT otherwise.
480  * XXX: This function should be called cr_canseemount() and should be
481  *      placed in kern_prot.c.
482  */
483 int
484 prison_canseemount(struct ucred *cred, struct mount *mp)
485 {
486 	struct prison *pr;
487 	struct statfs *sp;
488 	size_t len;
489 
490 	if (!jailed(cred) || jail_enforce_statfs == 0)
491 		return (0);
492 	pr = cred->cr_prison;
493 	if (pr->pr_root->v_mount == mp)
494 		return (0);
495 	if (jail_enforce_statfs == 2)
496 		return (ENOENT);
497 	/*
498 	 * If jail's chroot directory is set to "/" we should be able to see
499 	 * all mount-points from inside a jail.
500 	 * This is ugly check, but this is the only situation when jail's
501 	 * directory ends with '/'.
502 	 */
503 	if (strcmp(pr->pr_path, "/") == 0)
504 		return (0);
505 	len = strlen(pr->pr_path);
506 	sp = &mp->mnt_stat;
507 	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
508 		return (ENOENT);
509 	/*
510 	 * Be sure that we don't have situation where jail's root directory
511 	 * is "/some/path" and mount point is "/some/pathpath".
512 	 */
513 	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
514 		return (ENOENT);
515 	return (0);
516 }
517 
518 void
519 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
520 {
521 	char jpath[MAXPATHLEN];
522 	struct prison *pr;
523 	size_t len;
524 
525 	if (!jailed(cred) || jail_enforce_statfs == 0)
526 		return;
527 	pr = cred->cr_prison;
528 	if (prison_canseemount(cred, mp) != 0) {
529 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
530 		strlcpy(sp->f_mntonname, "[restricted]",
531 		    sizeof(sp->f_mntonname));
532 		return;
533 	}
534 	if (pr->pr_root->v_mount == mp) {
535 		/*
536 		 * Clear current buffer data, so we are sure nothing from
537 		 * the valid path left there.
538 		 */
539 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
540 		*sp->f_mntonname = '/';
541 		return;
542 	}
543 	/*
544 	 * If jail's chroot directory is set to "/" we should be able to see
545 	 * all mount-points from inside a jail.
546 	 */
547 	if (strcmp(pr->pr_path, "/") == 0)
548 		return;
549 	len = strlen(pr->pr_path);
550 	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
551 	/*
552 	 * Clear current buffer data, so we are sure nothing from
553 	 * the valid path left there.
554 	 */
555 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
556 	if (*jpath == '\0') {
557 		/* Should never happen. */
558 		*sp->f_mntonname = '/';
559 	} else {
560 		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
561 	}
562 }
563 
564 /*
565  * Check with permission for a specific privilege is granted within jail.  We
566  * have a specific list of accepted privileges; the rest are denied.
567  */
568 int
569 prison_priv_check(struct ucred *cred, int priv)
570 {
571 
572 	if (!jailed(cred))
573 		return (0);
574 
575 	switch (priv) {
576 
577 		/*
578 		 * Allow ktrace privileges for root in jail.
579 		 */
580 	case PRIV_KTRACE:
581 
582 #if 0
583 		/*
584 		 * Allow jailed processes to configure audit identity and
585 		 * submit audit records (login, etc).  In the future we may
586 		 * want to further refine the relationship between audit and
587 		 * jail.
588 		 */
589 	case PRIV_AUDIT_GETAUDIT:
590 	case PRIV_AUDIT_SETAUDIT:
591 	case PRIV_AUDIT_SUBMIT:
592 #endif
593 
594 		/*
595 		 * Allow jailed processes to manipulate process UNIX
596 		 * credentials in any way they see fit.
597 		 */
598 	case PRIV_CRED_SETUID:
599 	case PRIV_CRED_SETEUID:
600 	case PRIV_CRED_SETGID:
601 	case PRIV_CRED_SETEGID:
602 	case PRIV_CRED_SETGROUPS:
603 	case PRIV_CRED_SETREUID:
604 	case PRIV_CRED_SETREGID:
605 	case PRIV_CRED_SETRESUID:
606 	case PRIV_CRED_SETRESGID:
607 
608 		/*
609 		 * Jail implements visibility constraints already, so allow
610 		 * jailed root to override uid/gid-based constraints.
611 		 */
612 	case PRIV_SEEOTHERGIDS:
613 	case PRIV_SEEOTHERUIDS:
614 
615 		/*
616 		 * Jail implements inter-process debugging limits already, so
617 		 * allow jailed root various debugging privileges.
618 		 */
619 	case PRIV_DEBUG_DIFFCRED:
620 	case PRIV_DEBUG_SUGID:
621 	case PRIV_DEBUG_UNPRIV:
622 
623 		/*
624 		 * Allow jail to set various resource limits and login
625 		 * properties, and for now, exceed process resource limits.
626 		 */
627 	case PRIV_PROC_LIMIT:
628 	case PRIV_PROC_SETLOGIN:
629 	case PRIV_PROC_SETRLIMIT:
630 
631 		/*
632 		 * System V and POSIX IPC privileges are granted in jail.
633 		 */
634 	case PRIV_IPC_READ:
635 	case PRIV_IPC_WRITE:
636 	case PRIV_IPC_ADMIN:
637 	case PRIV_IPC_MSGSIZE:
638 	case PRIV_MQ_ADMIN:
639 
640 		/*
641 		 * Jail implements its own inter-process limits, so allow
642 		 * root processes in jail to change scheduling on other
643 		 * processes in the same jail.  Likewise for signalling.
644 		 */
645 	case PRIV_SCHED_DIFFCRED:
646 	case PRIV_SIGNAL_DIFFCRED:
647 	case PRIV_SIGNAL_SUGID:
648 
649 		/*
650 		 * Allow jailed processes to write to sysctls marked as jail
651 		 * writable.
652 		 */
653 	case PRIV_SYSCTL_WRITEJAIL:
654 
655 		/*
656 		 * Allow root in jail to manage a variety of quota
657 		 * properties.  These should likely be conditional on a
658 		 * configuration option.
659 		 */
660 	case PRIV_VFS_GETQUOTA:
661 	case PRIV_VFS_SETQUOTA:
662 
663 		/*
664 		 * Since Jail relies on chroot() to implement file system
665 		 * protections, grant many VFS privileges to root in jail.
666 		 * Be careful to exclude mount-related and NFS-related
667 		 * privileges.
668 		 */
669 	case PRIV_VFS_READ:
670 	case PRIV_VFS_WRITE:
671 	case PRIV_VFS_ADMIN:
672 	case PRIV_VFS_EXEC:
673 	case PRIV_VFS_LOOKUP:
674 	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
675 	case PRIV_VFS_CHFLAGS_DEV:
676 	case PRIV_VFS_CHOWN:
677 	case PRIV_VFS_CHROOT:
678 	case PRIV_VFS_RETAINSUGID:
679 	case PRIV_VFS_FCHROOT:
680 	case PRIV_VFS_LINK:
681 	case PRIV_VFS_SETGID:
682 	case PRIV_VFS_STAT:
683 	case PRIV_VFS_STICKYFILE:
684 		return (0);
685 
686 		/*
687 		 * Depending on the global setting, allow privilege of
688 		 * setting system flags.
689 		 */
690 	case PRIV_VFS_SYSFLAGS:
691 		if (jail_chflags_allowed)
692 			return (0);
693 		else
694 			return (EPERM);
695 
696 		/*
697 		 * Depending on the global setting, allow privilege of
698 		 * mounting/unmounting file systems.
699 		 */
700 	case PRIV_VFS_MOUNT:
701 	case PRIV_VFS_UNMOUNT:
702 	case PRIV_VFS_MOUNT_NONUSER:
703 	case PRIV_VFS_MOUNT_OWNER:
704 		if (jail_mount_allowed)
705 			return (0);
706 		else
707 			return (EPERM);
708 
709 		/*
710 		 * Allow jailed root to bind reserved ports and reuse in-use
711 		 * ports.
712 		 */
713 	case PRIV_NETINET_RESERVEDPORT:
714 	case PRIV_NETINET_REUSEPORT:
715 		return (0);
716 
717 		/*
718 		 * Allow jailed root to set certian IPv4/6 (option) headers.
719 		 */
720 	case PRIV_NETINET_SETHDROPTS:
721 		return (0);
722 
723 		/*
724 		 * Conditionally allow creating raw sockets in jail.
725 		 */
726 	case PRIV_NETINET_RAW:
727 		if (jail_allow_raw_sockets)
728 			return (0);
729 		else
730 			return (EPERM);
731 
732 		/*
733 		 * Since jail implements its own visibility limits on netstat
734 		 * sysctls, allow getcred.  This allows identd to work in
735 		 * jail.
736 		 */
737 	case PRIV_NETINET_GETCRED:
738 		return (0);
739 
740 	default:
741 		/*
742 		 * In all remaining cases, deny the privilege request.  This
743 		 * includes almost all network privileges, many system
744 		 * configuration privileges.
745 		 */
746 		return (EPERM);
747 	}
748 }
749 
750 /*
751  * Register jail service. Provides 'create' and 'destroy' methods.
752  * 'create' method will be called for every existing jail and all
753  * jails in the future as they beeing created.
754  * 'destroy' method will be called for every jail going away and
755  * for all existing jails at the time of service deregistration.
756  */
757 struct prison_service *
758 prison_service_register(const char *name, prison_create_t create,
759     prison_destroy_t destroy)
760 {
761 	struct prison_service *psrv, *psrv2;
762 	struct prison *pr;
763 	int reallocate = 1, slotno = 0;
764 	void **slots, **oldslots;
765 
766 	psrv = malloc(sizeof(*psrv) + strlen(name) + 1, M_PRISON,
767 	    M_WAITOK | M_ZERO);
768 	psrv->ps_create = create;
769 	psrv->ps_destroy = destroy;
770 	strcpy(psrv->ps_name, name);
771 	/*
772 	 * Grab the allprison_lock here, so we won't miss any jail
773 	 * creation/destruction.
774 	 */
775 	sx_xlock(&allprison_lock);
776 #ifdef INVARIANTS
777 	/*
778 	 * Verify if service is not already registered.
779 	 */
780 	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
781 		KASSERT(strcmp(psrv2->ps_name, name) != 0,
782 		    ("jail service %s already registered", name));
783 	}
784 #endif
785 	/*
786 	 * Find free slot. When there is no existing free slot available,
787 	 * allocate one at the end.
788 	 */
789 	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
790 		if (psrv2->ps_slotno != slotno) {
791 			KASSERT(slotno < psrv2->ps_slotno,
792 			    ("Invalid slotno (slotno=%d >= ps_slotno=%d",
793 			    slotno, psrv2->ps_slotno));
794 			/* We found free slot. */
795 			reallocate = 0;
796 			break;
797 		}
798 		slotno++;
799 	}
800 	psrv->ps_slotno = slotno;
801 	/*
802 	 * Keep the list sorted by slot number.
803 	 */
804 	if (psrv2 != NULL) {
805 		KASSERT(reallocate == 0, ("psrv2 != NULL && reallocate != 0"));
806 		TAILQ_INSERT_BEFORE(psrv2, psrv, ps_next);
807 	} else {
808 		KASSERT(reallocate == 1, ("psrv2 == NULL && reallocate == 0"));
809 		TAILQ_INSERT_TAIL(&prison_services, psrv, ps_next);
810 	}
811 	prison_service_slots++;
812 	sx_downgrade(&allprison_lock);
813 	/*
814 	 * Allocate memory for new slot if we didn't found empty one.
815 	 * Do not use realloc(9), because pr_slots is protected with a mutex,
816 	 * so we can't sleep.
817 	 */
818 	LIST_FOREACH(pr, &allprison, pr_list) {
819 		if (reallocate) {
820 			/* First allocate memory with M_WAITOK. */
821 			slots = malloc(sizeof(*slots) * prison_service_slots,
822 			    M_PRISON, M_WAITOK);
823 			/* Now grab the mutex and replace pr_slots. */
824 			mtx_lock(&pr->pr_mtx);
825 			oldslots = pr->pr_slots;
826 			if (psrv->ps_slotno > 0) {
827 				bcopy(oldslots, slots,
828 				    sizeof(*slots) * (prison_service_slots - 1));
829 			}
830 			slots[psrv->ps_slotno] = NULL;
831 			pr->pr_slots = slots;
832 			mtx_unlock(&pr->pr_mtx);
833 			if (oldslots != NULL)
834 				free(oldslots, M_PRISON);
835 		}
836 		/*
837 		 * Call 'create' method for each existing jail.
838 		 */
839 		psrv->ps_create(psrv, pr);
840 	}
841 	sx_sunlock(&allprison_lock);
842 
843 	return (psrv);
844 }
845 
846 void
847 prison_service_deregister(struct prison_service *psrv)
848 {
849 	struct prison *pr;
850 	void **slots, **oldslots;
851 	int last = 0;
852 
853 	sx_xlock(&allprison_lock);
854 	if (TAILQ_LAST(&prison_services, prison_services_head) == psrv)
855 		last = 1;
856 	TAILQ_REMOVE(&prison_services, psrv, ps_next);
857 	prison_service_slots--;
858 	sx_downgrade(&allprison_lock);
859 	LIST_FOREACH(pr, &allprison, pr_list) {
860 		/*
861 		 * Call 'destroy' method for every currently existing jail.
862 		 */
863 		psrv->ps_destroy(psrv, pr);
864 		/*
865 		 * If this is the last slot, free the memory allocated for it.
866 		 */
867 		if (last) {
868 			if (prison_service_slots == 0)
869 				slots = NULL;
870 			else {
871 				slots = malloc(sizeof(*slots) * prison_service_slots,
872 				    M_PRISON, M_WAITOK);
873 			}
874 			mtx_lock(&pr->pr_mtx);
875 			oldslots = pr->pr_slots;
876 			/*
877 			 * We require setting slot to NULL after freeing it,
878 			 * this way we can check for memory leaks here.
879 			 */
880 			KASSERT(oldslots[psrv->ps_slotno] == NULL,
881 			    ("Slot %d (service %s, jailid=%d) still contains data?",
882 			     psrv->ps_slotno, psrv->ps_name, pr->pr_id));
883 			if (psrv->ps_slotno > 0) {
884 				bcopy(oldslots, slots,
885 				    sizeof(*slots) * prison_service_slots);
886 			}
887 			pr->pr_slots = slots;
888 			mtx_unlock(&pr->pr_mtx);
889 			KASSERT(oldslots != NULL, ("oldslots == NULL"));
890 			free(oldslots, M_PRISON);
891 		}
892 	}
893 	sx_sunlock(&allprison_lock);
894 	free(psrv, M_PRISON);
895 }
896 
897 /*
898  * Function sets data for the given jail in slot assigned for the given
899  * jail service.
900  */
901 void
902 prison_service_data_set(struct prison_service *psrv, struct prison *pr,
903     void *data)
904 {
905 
906 	mtx_assert(&pr->pr_mtx, MA_OWNED);
907 	pr->pr_slots[psrv->ps_slotno] = data;
908 }
909 
910 /*
911  * Function clears slots assigned for the given jail service in the given
912  * prison structure and returns current slot data.
913  */
914 void *
915 prison_service_data_del(struct prison_service *psrv, struct prison *pr)
916 {
917 	void *data;
918 
919 	mtx_assert(&pr->pr_mtx, MA_OWNED);
920 	data = pr->pr_slots[psrv->ps_slotno];
921 	pr->pr_slots[psrv->ps_slotno] = NULL;
922 	return (data);
923 }
924 
925 /*
926  * Function returns current data from the slot assigned to the given jail
927  * service for the given jail.
928  */
929 void *
930 prison_service_data_get(struct prison_service *psrv, struct prison *pr)
931 {
932 
933 	mtx_assert(&pr->pr_mtx, MA_OWNED);
934 	return (pr->pr_slots[psrv->ps_slotno]);
935 }
936 
937 static int
938 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
939 {
940 	struct xprison *xp, *sxp;
941 	struct prison *pr;
942 	int count, error;
943 
944 	if (jailed(req->td->td_ucred))
945 		return (0);
946 
947 	sx_slock(&allprison_lock);
948 	if ((count = prisoncount) == 0) {
949 		sx_sunlock(&allprison_lock);
950 		return (0);
951 	}
952 
953 	sxp = xp = malloc(sizeof(*xp) * count, M_TEMP, M_WAITOK | M_ZERO);
954 
955 	LIST_FOREACH(pr, &allprison, pr_list) {
956 		xp->pr_version = XPRISON_VERSION;
957 		xp->pr_id = pr->pr_id;
958 		xp->pr_ip = pr->pr_ip;
959 		strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
960 		mtx_lock(&pr->pr_mtx);
961 		strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
962 		mtx_unlock(&pr->pr_mtx);
963 		xp++;
964 	}
965 	sx_sunlock(&allprison_lock);
966 
967 	error = SYSCTL_OUT(req, sxp, sizeof(*sxp) * count);
968 	free(sxp, M_TEMP);
969 	return (error);
970 }
971 
972 SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD,
973     NULL, 0, sysctl_jail_list, "S", "List of active jails");
974 
975 static int
976 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
977 {
978 	int error, injail;
979 
980 	injail = jailed(req->td->td_ucred);
981 	error = SYSCTL_OUT(req, &injail, sizeof(injail));
982 
983 	return (error);
984 }
985 SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD,
986     NULL, 0, sysctl_jail_jailed, "I", "Process in jail?");
987