xref: /freebsd/sys/kern/kern_jail.c (revision f7c4bd95ba735bd6a5454b4953945a99cefbb80c)
1 /*-
2  * ----------------------------------------------------------------------------
3  * "THE BEER-WARE LICENSE" (Revision 42):
4  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
5  * can do whatever you want with this stuff. If we meet some day, and you think
6  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
7  * ----------------------------------------------------------------------------
8  */
9 
10 #include <sys/cdefs.h>
11 __FBSDID("$FreeBSD$");
12 
13 #include "opt_mac.h"
14 
15 #include <sys/param.h>
16 #include <sys/types.h>
17 #include <sys/kernel.h>
18 #include <sys/systm.h>
19 #include <sys/errno.h>
20 #include <sys/sysproto.h>
21 #include <sys/malloc.h>
22 #include <sys/priv.h>
23 #include <sys/proc.h>
24 #include <sys/taskqueue.h>
25 #include <sys/fcntl.h>
26 #include <sys/jail.h>
27 #include <sys/lock.h>
28 #include <sys/mutex.h>
29 #include <sys/sx.h>
30 #include <sys/namei.h>
31 #include <sys/mount.h>
32 #include <sys/queue.h>
33 #include <sys/socket.h>
34 #include <sys/syscallsubr.h>
35 #include <sys/sysctl.h>
36 #include <sys/vnode.h>
37 #include <net/if.h>
38 #include <netinet/in.h>
39 
40 #include <security/mac/mac_framework.h>
41 
42 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
43 
44 SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
45     "Jail rules");
46 
47 int	jail_set_hostname_allowed = 1;
48 SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW,
49     &jail_set_hostname_allowed, 0,
50     "Processes in jail can set their hostnames");
51 
52 int	jail_socket_unixiproute_only = 1;
53 SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW,
54     &jail_socket_unixiproute_only, 0,
55     "Processes in jail are limited to creating UNIX/IPv4/route sockets only");
56 
57 int	jail_sysvipc_allowed = 0;
58 SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW,
59     &jail_sysvipc_allowed, 0,
60     "Processes in jail can use System V IPC primitives");
61 
62 static int jail_enforce_statfs = 2;
63 SYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW,
64     &jail_enforce_statfs, 0,
65     "Processes in jail cannot see all mounted file systems");
66 
67 int	jail_allow_raw_sockets = 0;
68 SYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW,
69     &jail_allow_raw_sockets, 0,
70     "Prison root can create raw sockets");
71 
72 int	jail_chflags_allowed = 0;
73 SYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW,
74     &jail_chflags_allowed, 0,
75     "Processes in jail can alter system file flags");
76 
77 int	jail_mount_allowed = 0;
78 SYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW,
79     &jail_mount_allowed, 0,
80     "Processes in jail can mount/unmount jail-friendly file systems");
81 
82 /* allprison, lastprid, and prisoncount are protected by allprison_lock. */
83 struct	prisonlist allprison;
84 struct	sx allprison_lock;
85 int	lastprid = 0;
86 int	prisoncount = 0;
87 
88 /*
89  * List of jail services. Protected by allprison_lock.
90  */
91 TAILQ_HEAD(prison_services_head, prison_service);
92 static struct prison_services_head prison_services =
93     TAILQ_HEAD_INITIALIZER(prison_services);
94 static int prison_service_slots = 0;
95 
96 struct prison_service {
97 	prison_create_t ps_create;
98 	prison_destroy_t ps_destroy;
99 	int		ps_slotno;
100 	TAILQ_ENTRY(prison_service) ps_next;
101 	char	ps_name[0];
102 };
103 
104 static void		 init_prison(void *);
105 static void		 prison_complete(void *context, int pending);
106 static int		 sysctl_jail_list(SYSCTL_HANDLER_ARGS);
107 
108 static void
109 init_prison(void *data __unused)
110 {
111 
112 	sx_init(&allprison_lock, "allprison");
113 	LIST_INIT(&allprison);
114 }
115 
116 SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
117 
118 /*
119  * struct jail_args {
120  *	struct jail *jail;
121  * };
122  */
123 int
124 jail(struct thread *td, struct jail_args *uap)
125 {
126 	struct nameidata nd;
127 	struct prison *pr, *tpr;
128 	struct prison_service *psrv;
129 	struct jail j;
130 	struct jail_attach_args jaa;
131 	int vfslocked, error, tryprid;
132 
133 	error = copyin(uap->jail, &j, sizeof(j));
134 	if (error)
135 		return (error);
136 	if (j.version != 0)
137 		return (EINVAL);
138 
139 	MALLOC(pr, struct prison *, sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
140 	mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF);
141 	pr->pr_ref = 1;
142 	error = copyinstr(j.path, &pr->pr_path, sizeof(pr->pr_path), 0);
143 	if (error)
144 		goto e_killmtx;
145 	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE,
146 	    pr->pr_path, td);
147 	error = namei(&nd);
148 	if (error)
149 		goto e_killmtx;
150 	vfslocked = NDHASGIANT(&nd);
151 	pr->pr_root = nd.ni_vp;
152 	VOP_UNLOCK(nd.ni_vp, 0);
153 	NDFREE(&nd, NDF_ONLY_PNBUF);
154 	VFS_UNLOCK_GIANT(vfslocked);
155 	error = copyinstr(j.hostname, &pr->pr_host, sizeof(pr->pr_host), 0);
156 	if (error)
157 		goto e_dropvnref;
158 	pr->pr_ip = j.ip_number;
159 	pr->pr_linux = NULL;
160 	pr->pr_securelevel = securelevel;
161 	if (prison_service_slots == 0)
162 		pr->pr_slots = NULL;
163 	else {
164 		pr->pr_slots = malloc(sizeof(*pr->pr_slots) * prison_service_slots,
165 		    M_PRISON, M_ZERO | M_WAITOK);
166 	}
167 
168 	/* Determine next pr_id and add prison to allprison list. */
169 	sx_xlock(&allprison_lock);
170 	tryprid = lastprid + 1;
171 	if (tryprid == JAIL_MAX)
172 		tryprid = 1;
173 next:
174 	LIST_FOREACH(tpr, &allprison, pr_list) {
175 		if (tpr->pr_id == tryprid) {
176 			tryprid++;
177 			if (tryprid == JAIL_MAX) {
178 				sx_xunlock(&allprison_lock);
179 				error = EAGAIN;
180 				goto e_dropvnref;
181 			}
182 			goto next;
183 		}
184 	}
185 	pr->pr_id = jaa.jid = lastprid = tryprid;
186 	LIST_INSERT_HEAD(&allprison, pr, pr_list);
187 	prisoncount++;
188 	sx_downgrade(&allprison_lock);
189 	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
190 		psrv->ps_create(psrv, pr);
191 	}
192 	sx_sunlock(&allprison_lock);
193 
194 	error = jail_attach(td, &jaa);
195 	if (error)
196 		goto e_dropprref;
197 	mtx_lock(&pr->pr_mtx);
198 	pr->pr_ref--;
199 	mtx_unlock(&pr->pr_mtx);
200 	td->td_retval[0] = jaa.jid;
201 	return (0);
202 e_dropprref:
203 	sx_xlock(&allprison_lock);
204 	LIST_REMOVE(pr, pr_list);
205 	prisoncount--;
206 	sx_downgrade(&allprison_lock);
207 	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
208 		psrv->ps_destroy(psrv, pr);
209 	}
210 	sx_sunlock(&allprison_lock);
211 e_dropvnref:
212 	if (pr->pr_slots != NULL)
213 		FREE(pr->pr_slots, M_PRISON);
214 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
215 	vrele(pr->pr_root);
216 	VFS_UNLOCK_GIANT(vfslocked);
217 e_killmtx:
218 	mtx_destroy(&pr->pr_mtx);
219 	FREE(pr, M_PRISON);
220 	return (error);
221 }
222 
223 /*
224  * struct jail_attach_args {
225  *	int jid;
226  * };
227  */
228 int
229 jail_attach(struct thread *td, struct jail_attach_args *uap)
230 {
231 	struct proc *p;
232 	struct ucred *newcred, *oldcred;
233 	struct prison *pr;
234 	int vfslocked, error;
235 
236 	/*
237 	 * XXX: Note that there is a slight race here if two threads
238 	 * in the same privileged process attempt to attach to two
239 	 * different jails at the same time.  It is important for
240 	 * user processes not to do this, or they might end up with
241 	 * a process root from one prison, but attached to the jail
242 	 * of another.
243 	 */
244 	error = priv_check(td, PRIV_JAIL_ATTACH);
245 	if (error)
246 		return (error);
247 
248 	p = td->td_proc;
249 	sx_slock(&allprison_lock);
250 	pr = prison_find(uap->jid);
251 	if (pr == NULL) {
252 		sx_sunlock(&allprison_lock);
253 		return (EINVAL);
254 	}
255 	pr->pr_ref++;
256 	mtx_unlock(&pr->pr_mtx);
257 	sx_sunlock(&allprison_lock);
258 
259 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
260 	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
261 	if ((error = change_dir(pr->pr_root, td)) != 0)
262 		goto e_unlock;
263 #ifdef MAC
264 	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
265 		goto e_unlock;
266 #endif
267 	VOP_UNLOCK(pr->pr_root, 0);
268 	change_root(pr->pr_root, td);
269 	VFS_UNLOCK_GIANT(vfslocked);
270 
271 	newcred = crget();
272 	PROC_LOCK(p);
273 	oldcred = p->p_ucred;
274 	setsugid(p);
275 	crcopy(newcred, oldcred);
276 	newcred->cr_prison = pr;
277 	p->p_ucred = newcred;
278 	PROC_UNLOCK(p);
279 	crfree(oldcred);
280 	return (0);
281 e_unlock:
282 	VOP_UNLOCK(pr->pr_root, 0);
283 	VFS_UNLOCK_GIANT(vfslocked);
284 	mtx_lock(&pr->pr_mtx);
285 	pr->pr_ref--;
286 	mtx_unlock(&pr->pr_mtx);
287 	return (error);
288 }
289 
290 /*
291  * Returns a locked prison instance, or NULL on failure.
292  */
293 struct prison *
294 prison_find(int prid)
295 {
296 	struct prison *pr;
297 
298 	sx_assert(&allprison_lock, SX_LOCKED);
299 	LIST_FOREACH(pr, &allprison, pr_list) {
300 		if (pr->pr_id == prid) {
301 			mtx_lock(&pr->pr_mtx);
302 			if (pr->pr_ref == 0) {
303 				mtx_unlock(&pr->pr_mtx);
304 				break;
305 			}
306 			return (pr);
307 		}
308 	}
309 	return (NULL);
310 }
311 
312 void
313 prison_free(struct prison *pr)
314 {
315 
316 	mtx_lock(&pr->pr_mtx);
317 	pr->pr_ref--;
318 	if (pr->pr_ref == 0) {
319 		mtx_unlock(&pr->pr_mtx);
320 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
321 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
322 		return;
323 	}
324 	mtx_unlock(&pr->pr_mtx);
325 }
326 
327 static void
328 prison_complete(void *context, int pending)
329 {
330 	struct prison_service *psrv;
331 	struct prison *pr;
332 	int vfslocked;
333 
334 	pr = (struct prison *)context;
335 
336 	sx_xlock(&allprison_lock);
337 	LIST_REMOVE(pr, pr_list);
338 	prisoncount--;
339 	sx_downgrade(&allprison_lock);
340 	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
341 		psrv->ps_destroy(psrv, pr);
342 	}
343 	sx_sunlock(&allprison_lock);
344 	if (pr->pr_slots != NULL)
345 		FREE(pr->pr_slots, M_PRISON);
346 
347 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
348 	vrele(pr->pr_root);
349 	VFS_UNLOCK_GIANT(vfslocked);
350 
351 	mtx_destroy(&pr->pr_mtx);
352 	if (pr->pr_linux != NULL)
353 		FREE(pr->pr_linux, M_PRISON);
354 	FREE(pr, M_PRISON);
355 }
356 
357 void
358 prison_hold(struct prison *pr)
359 {
360 
361 	mtx_lock(&pr->pr_mtx);
362 	KASSERT(pr->pr_ref > 0,
363 	    ("Trying to hold dead prison (id=%d).", pr->pr_id));
364 	pr->pr_ref++;
365 	mtx_unlock(&pr->pr_mtx);
366 }
367 
368 u_int32_t
369 prison_getip(struct ucred *cred)
370 {
371 
372 	return (cred->cr_prison->pr_ip);
373 }
374 
375 int
376 prison_ip(struct ucred *cred, int flag, u_int32_t *ip)
377 {
378 	u_int32_t tmp;
379 
380 	if (!jailed(cred))
381 		return (0);
382 	if (flag)
383 		tmp = *ip;
384 	else
385 		tmp = ntohl(*ip);
386 	if (tmp == INADDR_ANY) {
387 		if (flag)
388 			*ip = cred->cr_prison->pr_ip;
389 		else
390 			*ip = htonl(cred->cr_prison->pr_ip);
391 		return (0);
392 	}
393 	if (tmp == INADDR_LOOPBACK) {
394 		if (flag)
395 			*ip = cred->cr_prison->pr_ip;
396 		else
397 			*ip = htonl(cred->cr_prison->pr_ip);
398 		return (0);
399 	}
400 	if (cred->cr_prison->pr_ip != tmp)
401 		return (1);
402 	return (0);
403 }
404 
405 void
406 prison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip)
407 {
408 	u_int32_t tmp;
409 
410 	if (!jailed(cred))
411 		return;
412 	if (flag)
413 		tmp = *ip;
414 	else
415 		tmp = ntohl(*ip);
416 	if (tmp == INADDR_LOOPBACK) {
417 		if (flag)
418 			*ip = cred->cr_prison->pr_ip;
419 		else
420 			*ip = htonl(cred->cr_prison->pr_ip);
421 		return;
422 	}
423 	return;
424 }
425 
426 int
427 prison_if(struct ucred *cred, struct sockaddr *sa)
428 {
429 	struct sockaddr_in *sai;
430 	int ok;
431 
432 	sai = (struct sockaddr_in *)sa;
433 	if ((sai->sin_family != AF_INET) && jail_socket_unixiproute_only)
434 		ok = 1;
435 	else if (sai->sin_family != AF_INET)
436 		ok = 0;
437 	else if (cred->cr_prison->pr_ip != ntohl(sai->sin_addr.s_addr))
438 		ok = 1;
439 	else
440 		ok = 0;
441 	return (ok);
442 }
443 
444 /*
445  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
446  */
447 int
448 prison_check(struct ucred *cred1, struct ucred *cred2)
449 {
450 
451 	if (jailed(cred1)) {
452 		if (!jailed(cred2))
453 			return (ESRCH);
454 		if (cred2->cr_prison != cred1->cr_prison)
455 			return (ESRCH);
456 	}
457 
458 	return (0);
459 }
460 
461 /*
462  * Return 1 if the passed credential is in a jail, otherwise 0.
463  */
464 int
465 jailed(struct ucred *cred)
466 {
467 
468 	return (cred->cr_prison != NULL);
469 }
470 
471 /*
472  * Return the correct hostname for the passed credential.
473  */
474 void
475 getcredhostname(struct ucred *cred, char *buf, size_t size)
476 {
477 
478 	if (jailed(cred)) {
479 		mtx_lock(&cred->cr_prison->pr_mtx);
480 		strlcpy(buf, cred->cr_prison->pr_host, size);
481 		mtx_unlock(&cred->cr_prison->pr_mtx);
482 	} else {
483 		mtx_lock(&hostname_mtx);
484 		strlcpy(buf, hostname, size);
485 		mtx_unlock(&hostname_mtx);
486 	}
487 }
488 
489 /*
490  * Determine whether the subject represented by cred can "see"
491  * status of a mount point.
492  * Returns: 0 for permitted, ENOENT otherwise.
493  * XXX: This function should be called cr_canseemount() and should be
494  *      placed in kern_prot.c.
495  */
496 int
497 prison_canseemount(struct ucred *cred, struct mount *mp)
498 {
499 	struct prison *pr;
500 	struct statfs *sp;
501 	size_t len;
502 
503 	if (!jailed(cred) || jail_enforce_statfs == 0)
504 		return (0);
505 	pr = cred->cr_prison;
506 	if (pr->pr_root->v_mount == mp)
507 		return (0);
508 	if (jail_enforce_statfs == 2)
509 		return (ENOENT);
510 	/*
511 	 * If jail's chroot directory is set to "/" we should be able to see
512 	 * all mount-points from inside a jail.
513 	 * This is ugly check, but this is the only situation when jail's
514 	 * directory ends with '/'.
515 	 */
516 	if (strcmp(pr->pr_path, "/") == 0)
517 		return (0);
518 	len = strlen(pr->pr_path);
519 	sp = &mp->mnt_stat;
520 	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
521 		return (ENOENT);
522 	/*
523 	 * Be sure that we don't have situation where jail's root directory
524 	 * is "/some/path" and mount point is "/some/pathpath".
525 	 */
526 	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
527 		return (ENOENT);
528 	return (0);
529 }
530 
531 void
532 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
533 {
534 	char jpath[MAXPATHLEN];
535 	struct prison *pr;
536 	size_t len;
537 
538 	if (!jailed(cred) || jail_enforce_statfs == 0)
539 		return;
540 	pr = cred->cr_prison;
541 	if (prison_canseemount(cred, mp) != 0) {
542 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
543 		strlcpy(sp->f_mntonname, "[restricted]",
544 		    sizeof(sp->f_mntonname));
545 		return;
546 	}
547 	if (pr->pr_root->v_mount == mp) {
548 		/*
549 		 * Clear current buffer data, so we are sure nothing from
550 		 * the valid path left there.
551 		 */
552 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
553 		*sp->f_mntonname = '/';
554 		return;
555 	}
556 	/*
557 	 * If jail's chroot directory is set to "/" we should be able to see
558 	 * all mount-points from inside a jail.
559 	 */
560 	if (strcmp(pr->pr_path, "/") == 0)
561 		return;
562 	len = strlen(pr->pr_path);
563 	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
564 	/*
565 	 * Clear current buffer data, so we are sure nothing from
566 	 * the valid path left there.
567 	 */
568 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
569 	if (*jpath == '\0') {
570 		/* Should never happen. */
571 		*sp->f_mntonname = '/';
572 	} else {
573 		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
574 	}
575 }
576 
577 /*
578  * Check with permission for a specific privilege is granted within jail.  We
579  * have a specific list of accepted privileges; the rest are denied.
580  */
581 int
582 prison_priv_check(struct ucred *cred, int priv)
583 {
584 
585 	if (!jailed(cred))
586 		return (0);
587 
588 	switch (priv) {
589 
590 		/*
591 		 * Allow ktrace privileges for root in jail.
592 		 */
593 	case PRIV_KTRACE:
594 
595 #if 0
596 		/*
597 		 * Allow jailed processes to configure audit identity and
598 		 * submit audit records (login, etc).  In the future we may
599 		 * want to further refine the relationship between audit and
600 		 * jail.
601 		 */
602 	case PRIV_AUDIT_GETAUDIT:
603 	case PRIV_AUDIT_SETAUDIT:
604 	case PRIV_AUDIT_SUBMIT:
605 #endif
606 
607 		/*
608 		 * Allow jailed processes to manipulate process UNIX
609 		 * credentials in any way they see fit.
610 		 */
611 	case PRIV_CRED_SETUID:
612 	case PRIV_CRED_SETEUID:
613 	case PRIV_CRED_SETGID:
614 	case PRIV_CRED_SETEGID:
615 	case PRIV_CRED_SETGROUPS:
616 	case PRIV_CRED_SETREUID:
617 	case PRIV_CRED_SETREGID:
618 	case PRIV_CRED_SETRESUID:
619 	case PRIV_CRED_SETRESGID:
620 
621 		/*
622 		 * Jail implements visibility constraints already, so allow
623 		 * jailed root to override uid/gid-based constraints.
624 		 */
625 	case PRIV_SEEOTHERGIDS:
626 	case PRIV_SEEOTHERUIDS:
627 
628 		/*
629 		 * Jail implements inter-process debugging limits already, so
630 		 * allow jailed root various debugging privileges.
631 		 */
632 	case PRIV_DEBUG_DIFFCRED:
633 	case PRIV_DEBUG_SUGID:
634 	case PRIV_DEBUG_UNPRIV:
635 
636 		/*
637 		 * Allow jail to set various resource limits and login
638 		 * properties, and for now, exceed process resource limits.
639 		 */
640 	case PRIV_PROC_LIMIT:
641 	case PRIV_PROC_SETLOGIN:
642 	case PRIV_PROC_SETRLIMIT:
643 
644 		/*
645 		 * System V and POSIX IPC privileges are granted in jail.
646 		 */
647 	case PRIV_IPC_READ:
648 	case PRIV_IPC_WRITE:
649 	case PRIV_IPC_ADMIN:
650 	case PRIV_IPC_MSGSIZE:
651 	case PRIV_MQ_ADMIN:
652 
653 		/*
654 		 * Jail implements its own inter-process limits, so allow
655 		 * root processes in jail to change scheduling on other
656 		 * processes in the same jail.  Likewise for signalling.
657 		 */
658 	case PRIV_SCHED_DIFFCRED:
659 	case PRIV_SIGNAL_DIFFCRED:
660 	case PRIV_SIGNAL_SUGID:
661 
662 		/*
663 		 * Allow jailed processes to write to sysctls marked as jail
664 		 * writable.
665 		 */
666 	case PRIV_SYSCTL_WRITEJAIL:
667 
668 		/*
669 		 * Allow root in jail to manage a variety of quota
670 		 * properties.  These should likely be conditional on a
671 		 * configuration option.
672 		 */
673 	case PRIV_VFS_GETQUOTA:
674 	case PRIV_VFS_SETQUOTA:
675 
676 		/*
677 		 * Since Jail relies on chroot() to implement file system
678 		 * protections, grant many VFS privileges to root in jail.
679 		 * Be careful to exclude mount-related and NFS-related
680 		 * privileges.
681 		 */
682 	case PRIV_VFS_READ:
683 	case PRIV_VFS_WRITE:
684 	case PRIV_VFS_ADMIN:
685 	case PRIV_VFS_EXEC:
686 	case PRIV_VFS_LOOKUP:
687 	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
688 	case PRIV_VFS_CHFLAGS_DEV:
689 	case PRIV_VFS_CHOWN:
690 	case PRIV_VFS_CHROOT:
691 	case PRIV_VFS_RETAINSUGID:
692 	case PRIV_VFS_FCHROOT:
693 	case PRIV_VFS_LINK:
694 	case PRIV_VFS_SETGID:
695 	case PRIV_VFS_STAT:
696 	case PRIV_VFS_STICKYFILE:
697 		return (0);
698 
699 		/*
700 		 * Depending on the global setting, allow privilege of
701 		 * setting system flags.
702 		 */
703 	case PRIV_VFS_SYSFLAGS:
704 		if (jail_chflags_allowed)
705 			return (0);
706 		else
707 			return (EPERM);
708 
709 		/*
710 		 * Depending on the global setting, allow privilege of
711 		 * mounting/unmounting file systems.
712 		 */
713 	case PRIV_VFS_MOUNT:
714 	case PRIV_VFS_UNMOUNT:
715 	case PRIV_VFS_MOUNT_NONUSER:
716 	case PRIV_VFS_MOUNT_OWNER:
717 		if (jail_mount_allowed)
718 			return (0);
719 		else
720 			return (EPERM);
721 
722 		/*
723 		 * Allow jailed root to bind reserved ports and reuse in-use
724 		 * ports.
725 		 */
726 	case PRIV_NETINET_RESERVEDPORT:
727 	case PRIV_NETINET_REUSEPORT:
728 		return (0);
729 
730 		/*
731 		 * Allow jailed root to set certian IPv4/6 (option) headers.
732 		 */
733 	case PRIV_NETINET_SETHDROPTS:
734 		return (0);
735 
736 		/*
737 		 * Conditionally allow creating raw sockets in jail.
738 		 */
739 	case PRIV_NETINET_RAW:
740 		if (jail_allow_raw_sockets)
741 			return (0);
742 		else
743 			return (EPERM);
744 
745 		/*
746 		 * Since jail implements its own visibility limits on netstat
747 		 * sysctls, allow getcred.  This allows identd to work in
748 		 * jail.
749 		 */
750 	case PRIV_NETINET_GETCRED:
751 		return (0);
752 
753 	default:
754 		/*
755 		 * In all remaining cases, deny the privilege request.  This
756 		 * includes almost all network privileges, many system
757 		 * configuration privileges.
758 		 */
759 		return (EPERM);
760 	}
761 }
762 
763 /*
764  * Register jail service. Provides 'create' and 'destroy' methods.
765  * 'create' method will be called for every existing jail and all
766  * jails in the future as they beeing created.
767  * 'destroy' method will be called for every jail going away and
768  * for all existing jails at the time of service deregistration.
769  */
770 struct prison_service *
771 prison_service_register(const char *name, prison_create_t create,
772     prison_destroy_t destroy)
773 {
774 	struct prison_service *psrv, *psrv2;
775 	struct prison *pr;
776 	int reallocate = 1, slotno = 0;
777 	void **slots, **oldslots;
778 
779 	psrv = malloc(sizeof(*psrv) + strlen(name) + 1, M_PRISON,
780 	    M_WAITOK | M_ZERO);
781 	psrv->ps_create = create;
782 	psrv->ps_destroy = destroy;
783 	strcpy(psrv->ps_name, name);
784 	/*
785 	 * Grab the allprison_lock here, so we won't miss any jail
786 	 * creation/destruction.
787 	 */
788 	sx_xlock(&allprison_lock);
789 #ifdef INVARIANTS
790 	/*
791 	 * Verify if service is not already registered.
792 	 */
793 	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
794 		KASSERT(strcmp(psrv2->ps_name, name) != 0,
795 		    ("jail service %s already registered", name));
796 	}
797 #endif
798 	/*
799 	 * Find free slot. When there is no existing free slot available,
800 	 * allocate one at the end.
801 	 */
802 	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
803 		if (psrv2->ps_slotno != slotno) {
804 			KASSERT(slotno < psrv2->ps_slotno,
805 			    ("Invalid slotno (slotno=%d >= ps_slotno=%d",
806 			    slotno, psrv2->ps_slotno));
807 			/* We found free slot. */
808 			reallocate = 0;
809 			break;
810 		}
811 		slotno++;
812 	}
813 	psrv->ps_slotno = slotno;
814 	/*
815 	 * Keep the list sorted by slot number.
816 	 */
817 	if (psrv2 != NULL) {
818 		KASSERT(reallocate == 0, ("psrv2 != NULL && reallocate != 0"));
819 		TAILQ_INSERT_BEFORE(psrv2, psrv, ps_next);
820 	} else {
821 		KASSERT(reallocate == 1, ("psrv2 == NULL && reallocate == 0"));
822 		TAILQ_INSERT_TAIL(&prison_services, psrv, ps_next);
823 	}
824 	prison_service_slots++;
825 	sx_downgrade(&allprison_lock);
826 	/*
827 	 * Allocate memory for new slot if we didn't found empty one.
828 	 * Do not use realloc(9), because pr_slots is protected with a mutex,
829 	 * so we can't sleep.
830 	 */
831 	LIST_FOREACH(pr, &allprison, pr_list) {
832 		if (reallocate) {
833 			/* First allocate memory with M_WAITOK. */
834 			slots = malloc(sizeof(*slots) * prison_service_slots,
835 			    M_PRISON, M_WAITOK);
836 			/* Now grab the mutex and replace pr_slots. */
837 			mtx_lock(&pr->pr_mtx);
838 			oldslots = pr->pr_slots;
839 			if (psrv->ps_slotno > 0) {
840 				bcopy(oldslots, slots,
841 				    sizeof(*slots) * (prison_service_slots - 1));
842 			}
843 			slots[psrv->ps_slotno] = NULL;
844 			pr->pr_slots = slots;
845 			mtx_unlock(&pr->pr_mtx);
846 			if (oldslots != NULL)
847 				free(oldslots, M_PRISON);
848 		}
849 		/*
850 		 * Call 'create' method for each existing jail.
851 		 */
852 		psrv->ps_create(psrv, pr);
853 	}
854 	sx_sunlock(&allprison_lock);
855 
856 	return (psrv);
857 }
858 
859 void
860 prison_service_deregister(struct prison_service *psrv)
861 {
862 	struct prison *pr;
863 	void **slots, **oldslots;
864 	int last = 0;
865 
866 	sx_xlock(&allprison_lock);
867 	if (TAILQ_LAST(&prison_services, prison_services_head) == psrv)
868 		last = 1;
869 	TAILQ_REMOVE(&prison_services, psrv, ps_next);
870 	prison_service_slots--;
871 	sx_downgrade(&allprison_lock);
872 	LIST_FOREACH(pr, &allprison, pr_list) {
873 		/*
874 		 * Call 'destroy' method for every currently existing jail.
875 		 */
876 		psrv->ps_destroy(psrv, pr);
877 		/*
878 		 * If this is the last slot, free the memory allocated for it.
879 		 */
880 		if (last) {
881 			if (prison_service_slots == 0)
882 				slots = NULL;
883 			else {
884 				slots = malloc(sizeof(*slots) * prison_service_slots,
885 				    M_PRISON, M_WAITOK);
886 			}
887 			mtx_lock(&pr->pr_mtx);
888 			oldslots = pr->pr_slots;
889 			/*
890 			 * We require setting slot to NULL after freeing it,
891 			 * this way we can check for memory leaks here.
892 			 */
893 			KASSERT(oldslots[psrv->ps_slotno] == NULL,
894 			    ("Slot %d (service %s, jailid=%d) still contains data?",
895 			     psrv->ps_slotno, psrv->ps_name, pr->pr_id));
896 			if (psrv->ps_slotno > 0) {
897 				bcopy(oldslots, slots,
898 				    sizeof(*slots) * prison_service_slots);
899 			}
900 			pr->pr_slots = slots;
901 			mtx_unlock(&pr->pr_mtx);
902 			KASSERT(oldslots != NULL, ("oldslots == NULL"));
903 			free(oldslots, M_PRISON);
904 		}
905 	}
906 	sx_sunlock(&allprison_lock);
907 	free(psrv, M_PRISON);
908 }
909 
910 /*
911  * Function sets data for the given jail in slot assigned for the given
912  * jail service.
913  */
914 void
915 prison_service_data_set(struct prison_service *psrv, struct prison *pr,
916     void *data)
917 {
918 
919 	mtx_assert(&pr->pr_mtx, MA_OWNED);
920 	pr->pr_slots[psrv->ps_slotno] = data;
921 }
922 
923 /*
924  * Function clears slots assigned for the given jail service in the given
925  * prison structure and returns current slot data.
926  */
927 void *
928 prison_service_data_del(struct prison_service *psrv, struct prison *pr)
929 {
930 	void *data;
931 
932 	mtx_assert(&pr->pr_mtx, MA_OWNED);
933 	data = pr->pr_slots[psrv->ps_slotno];
934 	pr->pr_slots[psrv->ps_slotno] = NULL;
935 	return (data);
936 }
937 
938 /*
939  * Function returns current data from the slot assigned to the given jail
940  * service for the given jail.
941  */
942 void *
943 prison_service_data_get(struct prison_service *psrv, struct prison *pr)
944 {
945 
946 	mtx_assert(&pr->pr_mtx, MA_OWNED);
947 	return (pr->pr_slots[psrv->ps_slotno]);
948 }
949 
950 static int
951 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
952 {
953 	struct xprison *xp, *sxp;
954 	struct prison *pr;
955 	int count, error;
956 
957 	if (jailed(req->td->td_ucred))
958 		return (0);
959 
960 	sx_slock(&allprison_lock);
961 	if ((count = prisoncount) == 0) {
962 		sx_sunlock(&allprison_lock);
963 		return (0);
964 	}
965 
966 	sxp = xp = malloc(sizeof(*xp) * count, M_TEMP, M_WAITOK | M_ZERO);
967 
968 	LIST_FOREACH(pr, &allprison, pr_list) {
969 		xp->pr_version = XPRISON_VERSION;
970 		xp->pr_id = pr->pr_id;
971 		xp->pr_ip = pr->pr_ip;
972 		strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
973 		mtx_lock(&pr->pr_mtx);
974 		strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
975 		mtx_unlock(&pr->pr_mtx);
976 		xp++;
977 	}
978 	sx_sunlock(&allprison_lock);
979 
980 	error = SYSCTL_OUT(req, sxp, sizeof(*sxp) * count);
981 	free(sxp, M_TEMP);
982 	return (error);
983 }
984 
985 SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD,
986     NULL, 0, sysctl_jail_list, "S", "List of active jails");
987 
988 static int
989 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
990 {
991 	int error, injail;
992 
993 	injail = jailed(req->td->td_ucred);
994 	error = SYSCTL_OUT(req, &injail, sizeof(injail));
995 
996 	return (error);
997 }
998 SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD,
999     NULL, 0, sysctl_jail_jailed, "I", "Process in jail?");
1000