xref: /freebsd/sys/kern/uipc_mqueue.c (revision d9f0ce31900a48d1a2bfc1c8c86f79d1e831451a)
1 /*-
2  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  */
27 
28 /*
29  * POSIX message queue implementation.
30  *
31  * 1) A mqueue filesystem can be mounted; each message queue appears
32  *    in the mounted directory, where a user can change a queue's
33  *    permissions and ownership, or remove the queue. Manually creating
34  *    a file in the directory causes a message queue with the same name
35  *    and the default attributes to be created in the kernel; this
36  *    method is not recommended, since the mq_open syscall lets the user
37  *    specify different attributes. The file system can also be mounted
38  *    multiple times at different mount points, all showing the same contents.
39  *
40  * 2) Standard POSIX message queue API. The syscalls do not use vfs layer,
41  *    but directly operate on internal data structure, this allows user to
42  *    use the IPC facility without having to mount mqueue file system.
43  */
44 
45 #include <sys/cdefs.h>
46 __FBSDID("$FreeBSD$");
47 
48 #include "opt_capsicum.h"
49 #include "opt_compat.h"
50 
51 #include <sys/param.h>
52 #include <sys/kernel.h>
53 #include <sys/systm.h>
54 #include <sys/limits.h>
55 #include <sys/malloc.h>
56 #include <sys/buf.h>
57 #include <sys/capsicum.h>
58 #include <sys/dirent.h>
59 #include <sys/event.h>
60 #include <sys/eventhandler.h>
61 #include <sys/fcntl.h>
62 #include <sys/file.h>
63 #include <sys/filedesc.h>
64 #include <sys/jail.h>
65 #include <sys/lock.h>
66 #include <sys/module.h>
67 #include <sys/mount.h>
68 #include <sys/mqueue.h>
69 #include <sys/mutex.h>
70 #include <sys/namei.h>
71 #include <sys/posix4.h>
72 #include <sys/poll.h>
73 #include <sys/priv.h>
74 #include <sys/proc.h>
75 #include <sys/queue.h>
76 #include <sys/sysproto.h>
77 #include <sys/stat.h>
78 #include <sys/syscall.h>
79 #include <sys/syscallsubr.h>
80 #include <sys/sysent.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/unistd.h>
85 #include <sys/user.h>
86 #include <sys/vnode.h>
87 #include <machine/atomic.h>
88 
89 FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support");
90 
91 /*
92  * Limits and constants
93  */
/* Longest node name; mqfs_node.mn_name is sized MQFS_NAMELEN + 1. */
94 #define	MQFS_NAMELEN		NAME_MAX
/* NOTE(review): not referenced in the visible part of this file. */
95 #define MQFS_DELEN		(8 + MQFS_NAMELEN)
96 
97 /* node types */
98 typedef enum {
99 	mqfstype_none = 0,	/* unset; mqfs_allocv() panics on it */
100 	mqfstype_root,		/* root directory, created in mqfs_init() */
101 	mqfstype_dir,		/* directory (creation code is under "notyet") */
102 	mqfstype_this,		/* "." entry added by mqfs_fixup_dir() */
103 	mqfstype_parent,	/* ".." entry added by mqfs_fixup_dir() */
104 	mqfstype_file,		/* file node, created by mqfs_create_file() */
105 	mqfstype_symlink,	/* symlink (creation code is under "notyet") */
106 } mqfs_type_t;
107 
108 struct mqfs_node;
109 
110 /*
111  * mqfs_info: describes a mqfs instance
112  */
113 struct mqfs_info {
114 	struct sx		mi_lock;	/* held exclusively around node and vnode list changes */
115 	struct mqfs_node	*mi_root;	/* root directory node, set in mqfs_init() */
116 	struct unrhdr		*mi_unrhdr;	/* file number allocator, see mqfs_fileno_*() */
117 };
118 
/*
 * mqfs_vdata: ties one vnode to the mqfs_node it represents.  It is stored
 * in the vnode's v_data and linked on the node's mn_vnodes list by
 * mqfs_allocv(), and freed by mqfs_reclaim().
 */
119 struct mqfs_vdata {
120 	LIST_ENTRY(mqfs_vdata)	mv_link;	/* linkage on mqfs_node.mn_vnodes */
121 	struct mqfs_node	*mv_node;	/* node this vnode refers to */
122 	struct vnode		*mv_vnode;	/* back pointer to the vnode */
123 	struct task		mv_task;	/* runs do_recycle(); queued by do_unlink() */
124 };
125 
126 /*
127  * mqfs_node: describes a node (file or directory) within a mqfs
128  */
129 struct mqfs_node {
130 	char			mn_name[MQFS_NAMELEN+1];	/* entry name */
131 	struct mqfs_info	*mn_info;	/* owning instance; copied from the parent in mqfs_add_node() */
132 	struct mqfs_node	*mn_parent;	/* containing directory; cleared by do_unlink() */
133 	LIST_HEAD(,mqfs_node)	mn_children;	/* entries of a directory node */
134 	LIST_ENTRY(mqfs_node)	mn_sibling;	/* linkage on the parent's mn_children */
135 	LIST_HEAD(,mqfs_vdata)	mn_vnodes;	/* vnodes currently referring to this node */
136 	const void		*mn_pr_root;	/* creator's prison root; NULL matches any in mqfs_search() */
137 	int			mn_refcount;	/* see mqnode_addref() / mqnode_release() */
138 	mqfs_type_t		mn_type;	/* node type */
139 	int			mn_deleted;	/* set by do_unlink() */
140 	uint32_t		mn_fileno;	/* file number; 0 means none allocated */
141 	void			*mn_data;	/* struct mqueue * (see VTOMQ); freed in mqfs_destroy() */
142 	struct timespec		mn_birth;	/* creation time, set in mqfs_create_node() */
143 	struct timespec		mn_ctime;	/* initialised to mn_birth */
144 	struct timespec		mn_atime;	/* initialised to mn_birth */
145 	struct timespec		mn_mtime;	/* initialised to mn_birth */
146 	uid_t			mn_uid;		/* owner, from the creating credential */
147 	gid_t			mn_gid;		/* group, from the creating credential */
148 	int			mn_mode;	/* permission bits passed at creation */
149 };
150 
/* vnode -> mqfs_node, through the mqfs_vdata stored in v_data */
151 #define	VTON(vp)	(((struct mqfs_vdata *)((vp)->v_data))->mv_node)
/* vnode -> message queue stored in the node's mn_data */
152 #define VTOMQ(vp) 	((struct mqueue *)(VTON(vp)->mn_data))
/* mount -> mqfs instance stored in mnt_data (set by mqfs_mount()) */
153 #define	VFSTOMQFS(m)	((struct mqfs_info *)((m)->mnt_data))
/* file -> message queue; treats f_data as a struct mqfs_node pointer */
154 #define	FPTOMQ(fp)	((struct mqueue *)(((struct mqfs_node *) \
155 				(fp)->f_data)->mn_data))
156 
/*
 * NOTE(review): not used in the visible part of this file; presumably the
 * per-jail data for the prison hooks (mqfs_prison_*) -- confirm.
 */
157 struct mqfs_osd {
158 	struct task	mo_task;
159 	const void	*mo_pr_root;
160 };
161 
/* Tail queue head type for lists of mqueue_msg. */
162 TAILQ_HEAD(msgq, mqueue_msg);
163 
164 struct mqueue;
165 
/*
 * NOTE(review): the code that uses this structure is outside this view;
 * the field notes below are inferred from names and types -- confirm.
 */
166 struct mqueue_notifier {
167 	LIST_ENTRY(mqueue_notifier)	nt_link;	/* list linkage */
168 	struct sigevent			nt_sigev;	/* presumably the requested notification */
169 	ksiginfo_t			nt_ksi;		/* presumably the signal info to deliver */
170 	struct proc			*nt_proc;	/* presumably the registering process */
171 };
172 
/*
 * A message queue; file nodes point to one through mn_data.
 * NOTE(review): the send/receive code is outside this view; the field
 * notes below are inferred from names and types -- confirm.
 */
173 struct mqueue {
174 	struct mtx	mq_mutex;
175 	int		mq_flags;	/* presumably MQ_RSEL / MQ_WSEL bits */
176 	long		mq_maxmsg;
177 	long		mq_msgsize;
178 	long		mq_curmsgs;
179 	long		mq_totalbytes;
180 	struct msgq	mq_msgq;	/* tail queue of mqueue_msg */
181 	int		mq_receivers;
182 	int		mq_senders;
183 	struct selinfo	mq_rsel;
184 	struct selinfo	mq_wsel;
185 	struct mqueue_notifier	*mq_notifier;
186 };
187 
188 #define	MQ_RSEL		0x01
189 #define	MQ_WSEL		0x02
190 
/* Header of one queued message; the payload follows it in memory. */
191 struct mqueue_msg {
192 	TAILQ_ENTRY(mqueue_msg)	msg_link;	/* linkage on a struct msgq */
193 	unsigned int	msg_prio;
194 	unsigned int	msg_size;
195 	/* following real data... */
196 };
197 
/* Root of the kern.mqueue sysctl tree. */
198 static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0,
199 	"POSIX real time message queue");
200 
/*
 * NOTE(review): presumably the attributes applied when mqueue_alloc() is
 * given no mq_attr -- confirm (mqueue_alloc() is outside this view).
 */
201 static int	default_maxmsg  = 10;
202 static int	default_msgsize = 1024;
203 
/* Read-write tunables and counters exported under kern.mqueue. */
204 static int	maxmsg = 100;
205 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
206     &maxmsg, 0, "Default maximum messages in queue");
207 static int	maxmsgsize = 16384;
208 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
209     &maxmsgsize, 0, "Default maximum message size");
210 static int	maxmq = 100;
211 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
212     &maxmq, 0, "maximum message queues");
213 static int	curmq = 0;
214 SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
215     &curmq, 0, "current message queue number");
/* mqfs_uninit() returns EOPNOTSUPP while this is zero. */
216 static int	unloadable = 0;
217 static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
218 
/* Tag of the process_exit handler registered in mqfs_init(). */
219 static eventhandler_tag exit_tag;
220 
221 /* Only one instance per-system */
222 static struct mqfs_info		mqfs_data;
/* UMA zones, created in mqfs_init() and destroyed in mqfs_uninit(). */
223 static uma_zone_t		mqnode_zone;
224 static uma_zone_t		mqueue_zone;
225 static uma_zone_t		mvdata_zone;
226 static uma_zone_t		mqnoti_zone;
227 static struct vop_vector	mqfs_vnodeops;
228 static struct fileops		mqueueops;
/* Jail OSD slot returned by osd_jail_register() in mqfs_init(). */
229 static unsigned			mqfs_osd_jail_slot;
230 
231 /*
232  * Directory structure construction and manipulation
233  */
/* Directory and symlink creation is compiled only when "notyet" is defined. */
234 #ifdef notyet
235 static struct mqfs_node	*mqfs_create_dir(struct mqfs_node *parent,
236 	const char *name, int namelen, struct ucred *cred, int mode);
237 static struct mqfs_node	*mqfs_create_link(struct mqfs_node *parent,
238 	const char *name, int namelen, struct ucred *cred, int mode);
239 #endif
240 
241 static struct mqfs_node	*mqfs_create_file(struct mqfs_node *parent,
242 	const char *name, int namelen, struct ucred *cred, int mode);
243 static int	mqfs_destroy(struct mqfs_node *mn);
244 static void	mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
245 static void	mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
246 static int	mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
247 static int	mqfs_prison_create(void *obj, void *data);
248 static void	mqfs_prison_destructor(void *data);
249 static void	mqfs_prison_remove_task(void *context, int pending);
250 
251 /*
252  * Message queue construction and manipulation
253  */
254 static struct mqueue	*mqueue_alloc(const struct mq_attr *attr);
255 static void	mqueue_free(struct mqueue *mq);
256 static int	mqueue_send(struct mqueue *mq, const char *msg_ptr,
257 			size_t msg_len, unsigned msg_prio, int waitok,
258 			const struct timespec *abs_timeout);
259 static int	mqueue_receive(struct mqueue *mq, char *msg_ptr,
260 			size_t msg_len, unsigned *msg_prio, int waitok,
261 			const struct timespec *abs_timeout);
262 static int	_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
263 			int timo);
264 static int	_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
265 			int timo);
266 static void	mqueue_send_notification(struct mqueue *mq);
267 static void	mqueue_fdclose(struct thread *td, int fd, struct file *fp);
268 static void	mq_proc_exit(void *arg, struct proc *p);
269 
270 /*
271  * kqueue filters
272  */
273 static void	filt_mqdetach(struct knote *kn);
274 static int	filt_mqread(struct knote *kn, long hint);
275 static int	filt_mqwrite(struct knote *kn, long hint);
276 
/* Read-side filter: shares filt_mqdetach with the write side. */
277 struct filterops mq_rfiltops = {
278 	.f_isfd = 1,
279 	.f_detach = filt_mqdetach,
280 	.f_event = filt_mqread,
281 };
/* Write-side filter. */
282 struct filterops mq_wfiltops = {
283 	.f_isfd = 1,
284 	.f_detach = filt_mqdetach,
285 	.f_event = filt_mqwrite,
286 };
287 
288 /*
289  * Initialize fileno bitmap
290  */
291 static void
292 mqfs_fileno_init(struct mqfs_info *mi)
293 {
294 	struct unrhdr *up;
295 
296 	up = new_unrhdr(1, INT_MAX, NULL);
297 	mi->mi_unrhdr = up;
298 }
299 
300 /*
301  * Tear down fileno bitmap
302  */
303 static void
304 mqfs_fileno_uninit(struct mqfs_info *mi)
305 {
306 	struct unrhdr *up;
307 
308 	up = mi->mi_unrhdr;
309 	mi->mi_unrhdr = NULL;
310 	delete_unrhdr(up);
311 }
312 
313 /*
314  * Allocate a file number
315  */
316 static void
317 mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
318 {
319 	/* make sure our parent has a file number */
320 	if (mn->mn_parent && !mn->mn_parent->mn_fileno)
321 		mqfs_fileno_alloc(mi, mn->mn_parent);
322 
323 	switch (mn->mn_type) {
324 	case mqfstype_root:
325 	case mqfstype_dir:
326 	case mqfstype_file:
327 	case mqfstype_symlink:
328 		mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
329 		break;
330 	case mqfstype_this:
331 		KASSERT(mn->mn_parent != NULL,
332 		    ("mqfstype_this node has no parent"));
333 		mn->mn_fileno = mn->mn_parent->mn_fileno;
334 		break;
335 	case mqfstype_parent:
336 		KASSERT(mn->mn_parent != NULL,
337 		    ("mqfstype_parent node has no parent"));
338 		if (mn->mn_parent == mi->mi_root) {
339 			mn->mn_fileno = mn->mn_parent->mn_fileno;
340 			break;
341 		}
342 		KASSERT(mn->mn_parent->mn_parent != NULL,
343 		    ("mqfstype_parent node has no grandparent"));
344 		mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno;
345 		break;
346 	default:
347 		KASSERT(0,
348 		    ("mqfs_fileno_alloc() called for unknown type node: %d",
349 			mn->mn_type));
350 		break;
351 	}
352 }
353 
354 /*
355  * Release a file number
356  */
357 static void
358 mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
359 {
360 	switch (mn->mn_type) {
361 	case mqfstype_root:
362 	case mqfstype_dir:
363 	case mqfstype_file:
364 	case mqfstype_symlink:
365 		free_unr(mi->mi_unrhdr, mn->mn_fileno);
366 		break;
367 	case mqfstype_this:
368 	case mqfstype_parent:
369 		/* ignore these, as they don't "own" their file number */
370 		break;
371 	default:
372 		KASSERT(0,
373 		    ("mqfs_fileno_free() called for unknown type node: %d",
374 			mn->mn_type));
375 		break;
376 	}
377 }
378 
379 static __inline struct mqfs_node *
380 mqnode_alloc(void)
381 {
382 	return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO);
383 }
384 
385 static __inline void
386 mqnode_free(struct mqfs_node *node)
387 {
388 	uma_zfree(mqnode_zone, node);
389 }
390 
391 static __inline void
392 mqnode_addref(struct mqfs_node *node)
393 {
394 	atomic_fetchadd_int(&node->mn_refcount, 1);
395 }
396 
397 static __inline void
398 mqnode_release(struct mqfs_node *node)
399 {
400 	struct mqfs_info *mqfs;
401 	int old, exp;
402 
403 	mqfs = node->mn_info;
404 	old = atomic_fetchadd_int(&node->mn_refcount, -1);
405 	if (node->mn_type == mqfstype_dir ||
406 	    node->mn_type == mqfstype_root)
407 		exp = 3; /* include . and .. */
408 	else
409 		exp = 1;
410 	if (old == exp) {
411 		int locked = sx_xlocked(&mqfs->mi_lock);
412 		if (!locked)
413 			sx_xlock(&mqfs->mi_lock);
414 		mqfs_destroy(node);
415 		if (!locked)
416 			sx_xunlock(&mqfs->mi_lock);
417 	}
418 }
419 
420 /*
421  * Add a node to a directory
422  */
423 static int
424 mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
425 {
426 	KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
427 	KASSERT(parent->mn_info != NULL,
428 	    ("%s(): parent has no mn_info", __func__));
429 	KASSERT(parent->mn_type == mqfstype_dir ||
430 	    parent->mn_type == mqfstype_root,
431 	    ("%s(): parent is not a directory", __func__));
432 
433 	node->mn_info = parent->mn_info;
434 	node->mn_parent = parent;
435 	LIST_INIT(&node->mn_children);
436 	LIST_INIT(&node->mn_vnodes);
437 	LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
438 	mqnode_addref(parent);
439 	return (0);
440 }
441 
442 static struct mqfs_node *
443 mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
444 	int nodetype)
445 {
446 	struct mqfs_node *node;
447 
448 	node = mqnode_alloc();
449 	strncpy(node->mn_name, name, namelen);
450 	node->mn_pr_root = cred->cr_prison->pr_root;
451 	node->mn_type = nodetype;
452 	node->mn_refcount = 1;
453 	vfs_timestamp(&node->mn_birth);
454 	node->mn_ctime = node->mn_atime = node->mn_mtime
455 		= node->mn_birth;
456 	node->mn_uid = cred->cr_uid;
457 	node->mn_gid = cred->cr_gid;
458 	node->mn_mode = mode;
459 	return (node);
460 }
461 
462 /*
463  * Create a file
464  */
465 static struct mqfs_node *
466 mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
467 	struct ucred *cred, int mode)
468 {
469 	struct mqfs_node *node;
470 
471 	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
472 	if (mqfs_add_node(parent, node) != 0) {
473 		mqnode_free(node);
474 		return (NULL);
475 	}
476 	return (node);
477 }
478 
479 /*
480  * Add . and .. to a directory
481  */
482 static int
483 mqfs_fixup_dir(struct mqfs_node *parent)
484 {
485 	struct mqfs_node *dir;
486 
487 	dir = mqnode_alloc();
488 	dir->mn_name[0] = '.';
489 	dir->mn_type = mqfstype_this;
490 	dir->mn_refcount = 1;
491 	if (mqfs_add_node(parent, dir) != 0) {
492 		mqnode_free(dir);
493 		return (-1);
494 	}
495 
496 	dir = mqnode_alloc();
497 	dir->mn_name[0] = dir->mn_name[1] = '.';
498 	dir->mn_type = mqfstype_parent;
499 	dir->mn_refcount = 1;
500 
501 	if (mqfs_add_node(parent, dir) != 0) {
502 		mqnode_free(dir);
503 		return (-1);
504 	}
505 
506 	return (0);
507 }
508 
509 #ifdef notyet
510 
511 /*
512  * Create a directory
513  */
514 static struct mqfs_node *
515 mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
516 	struct ucred *cred, int mode)
517 {
518 	struct mqfs_node *node;
519 
520 	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
521 	if (mqfs_add_node(parent, node) != 0) {
522 		mqnode_free(node);
523 		return (NULL);
524 	}
525 
526 	if (mqfs_fixup_dir(node) != 0) {
527 		mqfs_destroy(node);
528 		return (NULL);
529 	}
530 	return (node);
531 }
532 
533 /*
534  * Create a symlink
535  */
536 static struct mqfs_node *
537 mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
538 	struct ucred *cred, int mode)
539 {
540 	struct mqfs_node *node;
541 
542 	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
543 	if (mqfs_add_node(parent, node) != 0) {
544 		mqnode_free(node);
545 		return (NULL);
546 	}
547 	return (node);
548 }
549 
550 #endif
551 
552 /*
553  * Destroy a node or a tree of nodes
554  */
555 static int
556 mqfs_destroy(struct mqfs_node *node)
557 {
558 	struct mqfs_node *parent;
559 
560 	KASSERT(node != NULL,
561 	    ("%s(): node is NULL", __func__));
562 	KASSERT(node->mn_info != NULL,
563 	    ("%s(): node has no mn_info", __func__));
564 
565 	/* destroy children */
566 	if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root)
567 		while (! LIST_EMPTY(&node->mn_children))
568 			mqfs_destroy(LIST_FIRST(&node->mn_children));
569 
570 	/* unlink from parent */
571 	if ((parent = node->mn_parent) != NULL) {
572 		KASSERT(parent->mn_info == node->mn_info,
573 		    ("%s(): parent has different mn_info", __func__));
574 		LIST_REMOVE(node, mn_sibling);
575 	}
576 
577 	if (node->mn_fileno != 0)
578 		mqfs_fileno_free(node->mn_info, node);
579 	if (node->mn_data != NULL)
580 		mqueue_free(node->mn_data);
581 	mqnode_free(node);
582 	return (0);
583 }
584 
585 /*
586  * Mount a mqfs instance
587  */
588 static int
589 mqfs_mount(struct mount *mp)
590 {
591 	struct statfs *sbp;
592 
593 	if (mp->mnt_flag & MNT_UPDATE)
594 		return (EOPNOTSUPP);
595 
596 	mp->mnt_data = &mqfs_data;
597 	MNT_ILOCK(mp);
598 	mp->mnt_flag |= MNT_LOCAL;
599 	MNT_IUNLOCK(mp);
600 	vfs_getnewfsid(mp);
601 
602 	sbp = &mp->mnt_stat;
603 	vfs_mountedfrom(mp, "mqueue");
604 	sbp->f_bsize = PAGE_SIZE;
605 	sbp->f_iosize = PAGE_SIZE;
606 	sbp->f_blocks = 1;
607 	sbp->f_bfree = 0;
608 	sbp->f_bavail = 0;
609 	sbp->f_files = 1;
610 	sbp->f_ffree = 0;
611 	return (0);
612 }
613 
614 /*
615  * Unmount a mqfs instance
616  */
617 static int
618 mqfs_unmount(struct mount *mp, int mntflags)
619 {
620 	int error;
621 
622 	error = vflush(mp, 0, (mntflags & MNT_FORCE) ?  FORCECLOSE : 0,
623 	    curthread);
624 	return (error);
625 }
626 
627 /*
628  * Return a root vnode
629  */
630 static int
631 mqfs_root(struct mount *mp, int flags, struct vnode **vpp)
632 {
633 	struct mqfs_info *mqfs;
634 	int ret;
635 
636 	mqfs = VFSTOMQFS(mp);
637 	ret = mqfs_allocv(mp, vpp, mqfs->mi_root);
638 	return (ret);
639 }
640 
/*
 * Return filesystem statistics.
 *
 * Nothing is updated here; sbp is left untouched and 0 is returned.
 */
static int
mqfs_statfs(struct mount *mp, struct statfs *sbp)
{

	/* XXX update statistics */
	return (0);
}
650 
651 /*
652  * Initialize a mqfs instance
653  */
654 static int
655 mqfs_init(struct vfsconf *vfc)
656 {
657 	struct mqfs_node *root;
658 	struct mqfs_info *mi;
659 	struct prison *pr;
660 	osd_method_t methods[PR_MAXMETHOD] = {
661 	    [PR_METHOD_CREATE] = mqfs_prison_create,
662 	};
663 
664 	mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
665 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
666 	mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
667 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
668 	mvdata_zone = uma_zcreate("mvdata",
669 		sizeof(struct mqfs_vdata), NULL, NULL, NULL,
670 		NULL, UMA_ALIGN_PTR, 0);
671 	mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
672 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
673 	mi = &mqfs_data;
674 	sx_init(&mi->mi_lock, "mqfs lock");
675 	/* set up the root diretory */
676 	root = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
677 		mqfstype_root);
678 	root->mn_info = mi;
679 	LIST_INIT(&root->mn_children);
680 	LIST_INIT(&root->mn_vnodes);
681 	mi->mi_root = root;
682 	mqfs_fileno_init(mi);
683 	mqfs_fileno_alloc(mi, root);
684 	mqfs_fixup_dir(root);
685 	exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
686 	    EVENTHANDLER_PRI_ANY);
687 	mq_fdclose = mqueue_fdclose;
688 	p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
689 
690 	/* Note current jails. */
691 	mqfs_osd_jail_slot = osd_jail_register(mqfs_prison_destructor, methods);
692 	sx_slock(&allprison_lock);
693 	TAILQ_FOREACH(pr, &allprison, pr_list)
694 		(void)mqfs_prison_create(pr, NULL);
695 	sx_sunlock(&allprison_lock);
696 	return (0);
697 }
698 
699 /*
700  * Destroy a mqfs instance
701  */
702 static int
703 mqfs_uninit(struct vfsconf *vfc)
704 {
705 	unsigned slot;
706 	struct mqfs_info *mi;
707 
708 	if (!unloadable)
709 		return (EOPNOTSUPP);
710 	slot = mqfs_osd_jail_slot;
711 	mqfs_osd_jail_slot = 0;
712 	osd_jail_deregister(slot);
713 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
714 	mi = &mqfs_data;
715 	mqfs_destroy(mi->mi_root);
716 	mi->mi_root = NULL;
717 	mqfs_fileno_uninit(mi);
718 	sx_destroy(&mi->mi_lock);
719 	uma_zdestroy(mqnode_zone);
720 	uma_zdestroy(mqueue_zone);
721 	uma_zdestroy(mvdata_zone);
722 	uma_zdestroy(mqnoti_zone);
723 	return (0);
724 }
725 
726 /*
727  * task routine
728  */
729 static void
730 do_recycle(void *context, int pending __unused)
731 {
732 	struct vnode *vp = (struct vnode *)context;
733 
734 	vrecycle(vp);
735 	vdrop(vp);
736 }
737 
738 /*
739  * Allocate a vnode
740  */
741 static int
742 mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
743 {
744 	struct mqfs_vdata *vd;
745 	struct mqfs_info  *mqfs;
746 	struct vnode *newvpp;
747 	int error;
748 
749 	mqfs = pn->mn_info;
750 	*vpp = NULL;
751 	sx_xlock(&mqfs->mi_lock);
752 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
753 		if (vd->mv_vnode->v_mount == mp) {
754 			vhold(vd->mv_vnode);
755 			break;
756 		}
757 	}
758 
759 	if (vd != NULL) {
760 found:
761 		*vpp = vd->mv_vnode;
762 		sx_xunlock(&mqfs->mi_lock);
763 		error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread);
764 		vdrop(*vpp);
765 		return (error);
766 	}
767 	sx_xunlock(&mqfs->mi_lock);
768 
769 	error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp);
770 	if (error)
771 		return (error);
772 	vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY);
773 	error = insmntque(newvpp, mp);
774 	if (error != 0)
775 		return (error);
776 
777 	sx_xlock(&mqfs->mi_lock);
778 	/*
779 	 * Check if it has already been allocated
780 	 * while we were blocked.
781 	 */
782 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
783 		if (vd->mv_vnode->v_mount == mp) {
784 			vhold(vd->mv_vnode);
785 			sx_xunlock(&mqfs->mi_lock);
786 
787 			vgone(newvpp);
788 			vput(newvpp);
789 			goto found;
790 		}
791 	}
792 
793 	*vpp = newvpp;
794 
795 	vd = uma_zalloc(mvdata_zone, M_WAITOK);
796 	(*vpp)->v_data = vd;
797 	vd->mv_vnode = *vpp;
798 	vd->mv_node = pn;
799 	TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
800 	LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
801 	mqnode_addref(pn);
802 	switch (pn->mn_type) {
803 	case mqfstype_root:
804 		(*vpp)->v_vflag = VV_ROOT;
805 		/* fall through */
806 	case mqfstype_dir:
807 	case mqfstype_this:
808 	case mqfstype_parent:
809 		(*vpp)->v_type = VDIR;
810 		break;
811 	case mqfstype_file:
812 		(*vpp)->v_type = VREG;
813 		break;
814 	case mqfstype_symlink:
815 		(*vpp)->v_type = VLNK;
816 		break;
817 	case mqfstype_none:
818 		KASSERT(0, ("mqfs_allocf called for null node\n"));
819 	default:
820 		panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
821 	}
822 	sx_xunlock(&mqfs->mi_lock);
823 	return (0);
824 }
825 
826 /*
827  * Search a directory entry
828  */
829 static struct mqfs_node *
830 mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred)
831 {
832 	struct mqfs_node *pn;
833 	const void *pr_root;
834 
835 	sx_assert(&pd->mn_info->mi_lock, SX_LOCKED);
836 	pr_root = cred->cr_prison->pr_root;
837 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
838 		/* Only match names within the same prison root directory */
839 		if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) &&
840 		    strncmp(pn->mn_name, name, len) == 0 &&
841 		    pn->mn_name[len] == '\0')
842 			return (pn);
843 	}
844 	return (NULL);
845 }
846 
847 /*
848  * Look up a file or directory.
849  */
850 static int
851 mqfs_lookupx(struct vop_cachedlookup_args *ap)
852 {
853 	struct componentname *cnp;
854 	struct vnode *dvp, **vpp;
855 	struct mqfs_node *pd;
856 	struct mqfs_node *pn;
857 	struct mqfs_info *mqfs;
858 	int nameiop, flags, error, namelen;
859 	char *pname;
860 	struct thread *td;
861 
862 	cnp = ap->a_cnp;
863 	vpp = ap->a_vpp;
864 	dvp = ap->a_dvp;
865 	pname = cnp->cn_nameptr;
866 	namelen = cnp->cn_namelen;
867 	td = cnp->cn_thread;
868 	flags = cnp->cn_flags;
869 	nameiop = cnp->cn_nameiop;
870 	pd = VTON(dvp);
871 	pn = NULL;
872 	mqfs = pd->mn_info;
873 	*vpp = NULLVP;
874 
875 	if (dvp->v_type != VDIR)
876 		return (ENOTDIR);
877 
878 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
879 	if (error)
880 		return (error);
881 
882 	/* shortcut: check if the name is too long */
883 	if (cnp->cn_namelen >= MQFS_NAMELEN)
884 		return (ENOENT);
885 
886 	/* self */
887 	if (namelen == 1 && pname[0] == '.') {
888 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
889 			return (EINVAL);
890 		pn = pd;
891 		*vpp = dvp;
892 		VREF(dvp);
893 		return (0);
894 	}
895 
896 	/* parent */
897 	if (cnp->cn_flags & ISDOTDOT) {
898 		if (dvp->v_vflag & VV_ROOT)
899 			return (EIO);
900 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
901 			return (EINVAL);
902 		VOP_UNLOCK(dvp, 0);
903 		KASSERT(pd->mn_parent, ("non-root directory has no parent"));
904 		pn = pd->mn_parent;
905 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
906 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
907 		return (error);
908 	}
909 
910 	/* named node */
911 	sx_xlock(&mqfs->mi_lock);
912 	pn = mqfs_search(pd, pname, namelen, cnp->cn_cred);
913 	if (pn != NULL)
914 		mqnode_addref(pn);
915 	sx_xunlock(&mqfs->mi_lock);
916 
917 	/* found */
918 	if (pn != NULL) {
919 		/* DELETE */
920 		if (nameiop == DELETE && (flags & ISLASTCN)) {
921 			error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
922 			if (error) {
923 				mqnode_release(pn);
924 				return (error);
925 			}
926 			if (*vpp == dvp) {
927 				VREF(dvp);
928 				*vpp = dvp;
929 				mqnode_release(pn);
930 				return (0);
931 			}
932 		}
933 
934 		/* allocate vnode */
935 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
936 		mqnode_release(pn);
937 		if (error == 0 && cnp->cn_flags & MAKEENTRY)
938 			cache_enter(dvp, *vpp, cnp);
939 		return (error);
940 	}
941 
942 	/* not found */
943 
944 	/* will create a new entry in the directory ? */
945 	if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
946 	    && (flags & ISLASTCN)) {
947 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
948 		if (error)
949 			return (error);
950 		cnp->cn_flags |= SAVENAME;
951 		return (EJUSTRETURN);
952 	}
953 	return (ENOENT);
954 }
955 
956 #if 0
957 struct vop_lookup_args {
958 	struct vop_generic_args a_gen;
959 	struct vnode *a_dvp;
960 	struct vnode **a_vpp;
961 	struct componentname *a_cnp;
962 };
963 #endif
964 
/*
 * vnode lookup operation: forwards to mqfs_lookupx() and returns its
 * result unchanged.
 */
static int
mqfs_lookup(struct vop_cachedlookup_args *ap)
{

	return (mqfs_lookupx(ap));
}
976 
977 #if 0
978 struct vop_create_args {
979 	struct vnode *a_dvp;
980 	struct vnode **a_vpp;
981 	struct componentname *a_cnp;
982 	struct vattr *a_vap;
983 };
984 #endif
985 
986 /*
987  * vnode creation operation
988  */
989 static int
990 mqfs_create(struct vop_create_args *ap)
991 {
992 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
993 	struct componentname *cnp = ap->a_cnp;
994 	struct mqfs_node *pd;
995 	struct mqfs_node *pn;
996 	struct mqueue *mq;
997 	int error;
998 
999 	pd = VTON(ap->a_dvp);
1000 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
1001 		return (ENOTDIR);
1002 	mq = mqueue_alloc(NULL);
1003 	if (mq == NULL)
1004 		return (EAGAIN);
1005 	sx_xlock(&mqfs->mi_lock);
1006 	if ((cnp->cn_flags & HASBUF) == 0)
1007 		panic("%s: no name", __func__);
1008 	pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
1009 		cnp->cn_cred, ap->a_vap->va_mode);
1010 	if (pn == NULL) {
1011 		sx_xunlock(&mqfs->mi_lock);
1012 		error = ENOSPC;
1013 	} else {
1014 		mqnode_addref(pn);
1015 		sx_xunlock(&mqfs->mi_lock);
1016 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
1017 		mqnode_release(pn);
1018 		if (error)
1019 			mqfs_destroy(pn);
1020 		else
1021 			pn->mn_data = mq;
1022 	}
1023 	if (error)
1024 		mqueue_free(mq);
1025 	return (error);
1026 }
1027 
1028 /*
1029  * Remove an entry
1030  */
1031 static
1032 int do_unlink(struct mqfs_node *pn, struct ucred *ucred)
1033 {
1034 	struct mqfs_node *parent;
1035 	struct mqfs_vdata *vd;
1036 	int error = 0;
1037 
1038 	sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
1039 
1040 	if (ucred->cr_uid != pn->mn_uid &&
1041 	    (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0)
1042 		error = EACCES;
1043 	else if (!pn->mn_deleted) {
1044 		parent = pn->mn_parent;
1045 		pn->mn_parent = NULL;
1046 		pn->mn_deleted = 1;
1047 		LIST_REMOVE(pn, mn_sibling);
1048 		LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
1049 			cache_purge(vd->mv_vnode);
1050 			vhold(vd->mv_vnode);
1051 			taskqueue_enqueue(taskqueue_thread, &vd->mv_task);
1052 		}
1053 		mqnode_release(pn);
1054 		mqnode_release(parent);
1055 	} else
1056 		error = ENOENT;
1057 	return (error);
1058 }
1059 
1060 #if 0
1061 struct vop_remove_args {
1062 	struct vnode *a_dvp;
1063 	struct vnode *a_vp;
1064 	struct componentname *a_cnp;
1065 };
1066 #endif
1067 
1068 /*
1069  * vnode removal operation
1070  */
1071 static int
1072 mqfs_remove(struct vop_remove_args *ap)
1073 {
1074 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
1075 	struct mqfs_node *pn;
1076 	int error;
1077 
1078 	if (ap->a_vp->v_type == VDIR)
1079                 return (EPERM);
1080 	pn = VTON(ap->a_vp);
1081 	sx_xlock(&mqfs->mi_lock);
1082 	error = do_unlink(pn, ap->a_cnp->cn_cred);
1083 	sx_xunlock(&mqfs->mi_lock);
1084 	return (error);
1085 }
1086 
1087 #if 0
1088 struct vop_inactive_args {
1089 	struct vnode *a_vp;
1090 	struct thread *a_td;
1091 };
1092 #endif
1093 
1094 static int
1095 mqfs_inactive(struct vop_inactive_args *ap)
1096 {
1097 	struct mqfs_node *pn = VTON(ap->a_vp);
1098 
1099 	if (pn->mn_deleted)
1100 		vrecycle(ap->a_vp);
1101 	return (0);
1102 }
1103 
1104 #if 0
1105 struct vop_reclaim_args {
1106 	struct vop_generic_args a_gen;
1107 	struct vnode *a_vp;
1108 	struct thread *a_td;
1109 };
1110 #endif
1111 
1112 static int
1113 mqfs_reclaim(struct vop_reclaim_args *ap)
1114 {
1115 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount);
1116 	struct vnode *vp = ap->a_vp;
1117 	struct mqfs_node *pn;
1118 	struct mqfs_vdata *vd;
1119 
1120 	vd = vp->v_data;
1121 	pn = vd->mv_node;
1122 	sx_xlock(&mqfs->mi_lock);
1123 	vp->v_data = NULL;
1124 	LIST_REMOVE(vd, mv_link);
1125 	uma_zfree(mvdata_zone, vd);
1126 	mqnode_release(pn);
1127 	sx_xunlock(&mqfs->mi_lock);
1128 	return (0);
1129 }
1130 
1131 #if 0
1132 struct vop_open_args {
1133 	struct vop_generic_args a_gen;
1134 	struct vnode *a_vp;
1135 	int a_mode;
1136 	struct ucred *a_cred;
1137 	struct thread *a_td;
1138 	struct file *a_fp;
1139 };
1140 #endif
1141 
/*
 * vnode open operation: nothing to do, always succeeds.
 */
static int
mqfs_open(struct vop_open_args *ap)
{

	return (0);
}
1147 
1148 #if 0
1149 struct vop_close_args {
1150 	struct vop_generic_args a_gen;
1151 	struct vnode *a_vp;
1152 	int a_fflag;
1153 	struct ucred *a_cred;
1154 	struct thread *a_td;
1155 };
1156 #endif
1157 
/*
 * vnode close operation: nothing to do, always succeeds.
 */
static int
mqfs_close(struct vop_close_args *ap)
{

	return (0);
}
1163 
1164 #if 0
1165 struct vop_access_args {
1166 	struct vop_generic_args a_gen;
1167 	struct vnode *a_vp;
1168 	accmode_t a_accmode;
1169 	struct ucred *a_cred;
1170 	struct thread *a_td;
1171 };
1172 #endif
1173 
1174 /*
1175  * Verify permissions
1176  */
1177 static int
1178 mqfs_access(struct vop_access_args *ap)
1179 {
1180 	struct vnode *vp = ap->a_vp;
1181 	struct vattr vattr;
1182 	int error;
1183 
1184 	error = VOP_GETATTR(vp, &vattr, ap->a_cred);
1185 	if (error)
1186 		return (error);
1187 	error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid,
1188 	    vattr.va_gid, ap->a_accmode, ap->a_cred, NULL);
1189 	return (error);
1190 }
1191 
1192 #if 0
1193 struct vop_getattr_args {
1194 	struct vop_generic_args a_gen;
1195 	struct vnode *a_vp;
1196 	struct vattr *a_vap;
1197 	struct ucred *a_cred;
1198 };
1199 #endif
1200 
1201 /*
1202  * Get file attributes
1203  */
1204 static int
1205 mqfs_getattr(struct vop_getattr_args *ap)
1206 {
1207 	struct vnode *vp = ap->a_vp;
1208 	struct mqfs_node *pn = VTON(vp);
1209 	struct vattr *vap = ap->a_vap;
1210 	int error = 0;
1211 
1212 	vap->va_type = vp->v_type;
1213 	vap->va_mode = pn->mn_mode;
1214 	vap->va_nlink = 1;
1215 	vap->va_uid = pn->mn_uid;
1216 	vap->va_gid = pn->mn_gid;
1217 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
1218 	vap->va_fileid = pn->mn_fileno;
1219 	vap->va_size = 0;
1220 	vap->va_blocksize = PAGE_SIZE;
1221 	vap->va_bytes = vap->va_size = 0;
1222 	vap->va_atime = pn->mn_atime;
1223 	vap->va_mtime = pn->mn_mtime;
1224 	vap->va_ctime = pn->mn_ctime;
1225 	vap->va_birthtime = pn->mn_birth;
1226 	vap->va_gen = 0;
1227 	vap->va_flags = 0;
1228 	vap->va_rdev = NODEV;
1229 	vap->va_bytes = 0;
1230 	vap->va_filerev = 0;
1231 	return (error);
1232 }
1233 
1234 #if 0
1235 struct vop_setattr_args {
1236 	struct vop_generic_args a_gen;
1237 	struct vnode *a_vp;
1238 	struct vattr *a_vap;
1239 	struct ucred *a_cred;
1240 };
1241 #endif
1242 /*
1243  * Set attributes
1244  */
1245 static int
1246 mqfs_setattr(struct vop_setattr_args *ap)
1247 {
1248 	struct mqfs_node *pn;
1249 	struct vattr *vap;
1250 	struct vnode *vp;
1251 	struct thread *td;
1252 	int c, error;
1253 	uid_t uid;
1254 	gid_t gid;
1255 
1256 	td = curthread;
1257 	vap = ap->a_vap;
1258 	vp = ap->a_vp;
1259 	if ((vap->va_type != VNON) ||
1260 	    (vap->va_nlink != VNOVAL) ||
1261 	    (vap->va_fsid != VNOVAL) ||
1262 	    (vap->va_fileid != VNOVAL) ||
1263 	    (vap->va_blocksize != VNOVAL) ||
1264 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
1265 	    (vap->va_rdev != VNOVAL) ||
1266 	    ((int)vap->va_bytes != VNOVAL) ||
1267 	    (vap->va_gen != VNOVAL)) {
1268 		return (EINVAL);
1269 	}
1270 
1271 	pn = VTON(vp);
1272 
1273 	error = c = 0;
1274 	if (vap->va_uid == (uid_t)VNOVAL)
1275 		uid = pn->mn_uid;
1276 	else
1277 		uid = vap->va_uid;
1278 	if (vap->va_gid == (gid_t)VNOVAL)
1279 		gid = pn->mn_gid;
1280 	else
1281 		gid = vap->va_gid;
1282 
1283 	if (uid != pn->mn_uid || gid != pn->mn_gid) {
1284 		/*
1285 		 * To modify the ownership of a file, must possess VADMIN
1286 		 * for that file.
1287 		 */
1288 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)))
1289 			return (error);
1290 
1291 		/*
1292 		 * XXXRW: Why is there a privilege check here: shouldn't the
1293 		 * check in VOP_ACCESS() be enough?  Also, are the group bits
1294 		 * below definitely right?
1295 		 */
1296 		if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
1297 		    (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
1298 		    (error = priv_check(td, PRIV_MQ_ADMIN)) != 0)
1299 			return (error);
1300 		pn->mn_uid = uid;
1301 		pn->mn_gid = gid;
1302 		c = 1;
1303 	}
1304 
1305 	if (vap->va_mode != (mode_t)VNOVAL) {
1306 		if ((ap->a_cred->cr_uid != pn->mn_uid) &&
1307 		    (error = priv_check(td, PRIV_MQ_ADMIN)))
1308 			return (error);
1309 		pn->mn_mode = vap->va_mode;
1310 		c = 1;
1311 	}
1312 
1313 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
1314 		/* See the comment in ufs_vnops::ufs_setattr(). */
1315 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) &&
1316 		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
1317 		    (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td))))
1318 			return (error);
1319 		if (vap->va_atime.tv_sec != VNOVAL) {
1320 			pn->mn_atime = vap->va_atime;
1321 		}
1322 		if (vap->va_mtime.tv_sec != VNOVAL) {
1323 			pn->mn_mtime = vap->va_mtime;
1324 		}
1325 		c = 1;
1326 	}
1327 	if (c) {
1328 		vfs_timestamp(&pn->mn_ctime);
1329 	}
1330 	return (0);
1331 }
1332 
1333 #if 0
1334 struct vop_read_args {
1335 	struct vop_generic_args a_gen;
1336 	struct vnode *a_vp;
1337 	struct uio *a_uio;
1338 	int a_ioflag;
1339 	struct ucred *a_cred;
1340 };
1341 #endif
1342 
1343 /*
1344  * Read from a file
1345  */
1346 static int
1347 mqfs_read(struct vop_read_args *ap)
1348 {
1349 	char buf[80];
1350 	struct vnode *vp = ap->a_vp;
1351 	struct uio *uio = ap->a_uio;
1352 	struct mqfs_node *pn;
1353 	struct mqueue *mq;
1354 	int len, error;
1355 
1356 	if (vp->v_type != VREG)
1357 		return (EINVAL);
1358 
1359 	pn = VTON(vp);
1360 	mq = VTOMQ(vp);
1361 	snprintf(buf, sizeof(buf),
1362 		"QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
1363 		mq->mq_totalbytes,
1364 		mq->mq_maxmsg,
1365 		mq->mq_curmsgs,
1366 		mq->mq_msgsize);
1367 	buf[sizeof(buf)-1] = '\0';
1368 	len = strlen(buf);
1369 	error = uiomove_frombuf(buf, len, uio);
1370 	return (error);
1371 }
1372 
1373 #if 0
1374 struct vop_readdir_args {
1375 	struct vop_generic_args a_gen;
1376 	struct vnode *a_vp;
1377 	struct uio *a_uio;
1378 	struct ucred *a_cred;
1379 	int *a_eofflag;
1380 	int *a_ncookies;
1381 	u_long **a_cookies;
1382 };
1383 #endif
1384 
1385 /*
1386  * Return directory entries.
1387  */
1388 static int
1389 mqfs_readdir(struct vop_readdir_args *ap)
1390 {
1391 	struct vnode *vp;
1392 	struct mqfs_info *mi;
1393 	struct mqfs_node *pd;
1394 	struct mqfs_node *pn;
1395 	struct dirent entry;
1396 	struct uio *uio;
1397 	const void *pr_root;
1398 	int *tmp_ncookies = NULL;
1399 	off_t offset;
1400 	int error, i;
1401 
1402 	vp = ap->a_vp;
1403 	mi = VFSTOMQFS(vp->v_mount);
1404 	pd = VTON(vp);
1405 	uio = ap->a_uio;
1406 
1407 	if (vp->v_type != VDIR)
1408 		return (ENOTDIR);
1409 
1410 	if (uio->uio_offset < 0)
1411 		return (EINVAL);
1412 
1413 	if (ap->a_ncookies != NULL) {
1414 		tmp_ncookies = ap->a_ncookies;
1415 		*ap->a_ncookies = 0;
1416 		ap->a_ncookies = NULL;
1417         }
1418 
1419 	error = 0;
1420 	offset = 0;
1421 
1422 	pr_root = ap->a_cred->cr_prison->pr_root;
1423 	sx_xlock(&mi->mi_lock);
1424 
1425 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
1426 		entry.d_reclen = sizeof(entry);
1427 
1428 		/*
1429 		 * Only show names within the same prison root directory
1430 		 * (or not associated with a prison, e.g. "." and "..").
1431 		 */
1432 		if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root)
1433 			continue;
1434 		if (!pn->mn_fileno)
1435 			mqfs_fileno_alloc(mi, pn);
1436 		entry.d_fileno = pn->mn_fileno;
1437 		for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
1438 			entry.d_name[i] = pn->mn_name[i];
1439 		entry.d_name[i] = 0;
1440 		entry.d_namlen = i;
1441 		switch (pn->mn_type) {
1442 		case mqfstype_root:
1443 		case mqfstype_dir:
1444 		case mqfstype_this:
1445 		case mqfstype_parent:
1446 			entry.d_type = DT_DIR;
1447 			break;
1448 		case mqfstype_file:
1449 			entry.d_type = DT_REG;
1450 			break;
1451 		case mqfstype_symlink:
1452 			entry.d_type = DT_LNK;
1453 			break;
1454 		default:
1455 			panic("%s has unexpected node type: %d", pn->mn_name,
1456 				pn->mn_type);
1457 		}
1458 		if (entry.d_reclen > uio->uio_resid)
1459                         break;
1460 		if (offset >= uio->uio_offset) {
1461 			error = vfs_read_dirent(ap, &entry, offset);
1462                         if (error)
1463                                 break;
1464                 }
1465                 offset += entry.d_reclen;
1466 	}
1467 	sx_xunlock(&mi->mi_lock);
1468 
1469 	uio->uio_offset = offset;
1470 
1471 	if (tmp_ncookies != NULL)
1472 		ap->a_ncookies = tmp_ncookies;
1473 
1474 	return (error);
1475 }
1476 
1477 #ifdef notyet
1478 
1479 #if 0
1480 struct vop_mkdir_args {
1481 	struct vnode *a_dvp;
1482 	struvt vnode **a_vpp;
1483 	struvt componentname *a_cnp;
1484 	struct vattr *a_vap;
1485 };
1486 #endif
1487 
1488 /*
1489  * Create a directory.
1490  */
1491 static int
1492 mqfs_mkdir(struct vop_mkdir_args *ap)
1493 {
1494 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
1495 	struct componentname *cnp = ap->a_cnp;
1496 	struct mqfs_node *pd = VTON(ap->a_dvp);
1497 	struct mqfs_node *pn;
1498 	int error;
1499 
1500 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
1501 		return (ENOTDIR);
1502 	sx_xlock(&mqfs->mi_lock);
1503 	if ((cnp->cn_flags & HASBUF) == 0)
1504 		panic("%s: no name", __func__);
1505 	pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
1506 		ap->a_vap->cn_cred, ap->a_vap->va_mode);
1507 	if (pn != NULL)
1508 		mqnode_addref(pn);
1509 	sx_xunlock(&mqfs->mi_lock);
1510 	if (pn == NULL) {
1511 		error = ENOSPC;
1512 	} else {
1513 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
1514 		mqnode_release(pn);
1515 	}
1516 	return (error);
1517 }
1518 
1519 #if 0
1520 struct vop_rmdir_args {
1521 	struct vnode *a_dvp;
1522 	struct vnode *a_vp;
1523 	struct componentname *a_cnp;
1524 };
1525 #endif
1526 
1527 /*
1528  * Remove a directory.
1529  */
1530 static int
1531 mqfs_rmdir(struct vop_rmdir_args *ap)
1532 {
1533 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
1534 	struct mqfs_node *pn = VTON(ap->a_vp);
1535 	struct mqfs_node *pt;
1536 
1537 	if (pn->mn_type != mqfstype_dir)
1538 		return (ENOTDIR);
1539 
1540 	sx_xlock(&mqfs->mi_lock);
1541 	if (pn->mn_deleted) {
1542 		sx_xunlock(&mqfs->mi_lock);
1543 		return (ENOENT);
1544 	}
1545 
1546 	pt = LIST_FIRST(&pn->mn_children);
1547 	pt = LIST_NEXT(pt, mn_sibling);
1548 	pt = LIST_NEXT(pt, mn_sibling);
1549 	if (pt != NULL) {
1550 		sx_xunlock(&mqfs->mi_lock);
1551 		return (ENOTEMPTY);
1552 	}
1553 	pt = pn->mn_parent;
1554 	pn->mn_parent = NULL;
1555 	pn->mn_deleted = 1;
1556 	LIST_REMOVE(pn, mn_sibling);
1557 	mqnode_release(pn);
1558 	mqnode_release(pt);
1559 	sx_xunlock(&mqfs->mi_lock);
1560 	cache_purge(ap->a_vp);
1561 	return (0);
1562 }
1563 
1564 #endif /* notyet */
1565 
1566 
1567 /*
1568  * Set a destructor task with the prison's root
1569  */
1570 static int
1571 mqfs_prison_create(void *obj, void *data __unused)
1572 {
1573 	struct prison *pr = obj;
1574 	struct mqfs_osd *mo;
1575 	void *rsv;
1576 
1577 	if (pr->pr_root == pr->pr_parent->pr_root)
1578 		return(0);
1579 
1580 	mo = malloc(sizeof(struct mqfs_osd), M_PRISON, M_WAITOK);
1581 	rsv = osd_reserve(mqfs_osd_jail_slot);
1582 	TASK_INIT(&mo->mo_task, 0, mqfs_prison_remove_task, mo);
1583 	mtx_lock(&pr->pr_mtx);
1584 	mo->mo_pr_root = pr->pr_root;
1585 	(void)osd_jail_set_reserved(pr, mqfs_osd_jail_slot, rsv, mo);
1586 	mtx_unlock(&pr->pr_mtx);
1587 	return (0);
1588 }
1589 
1590 /*
1591  * Queue the task for after jail/OSD locks are released
1592  */
1593 static void
1594 mqfs_prison_destructor(void *data)
1595 {
1596 	struct mqfs_osd *mo = data;
1597 
1598 	if (mqfs_osd_jail_slot != 0)
1599 		taskqueue_enqueue(taskqueue_thread, &mo->mo_task);
1600 	else
1601 		free(mo, M_PRISON);
1602 }
1603 
1604 /*
1605  * See if this prison root is obsolete, and clean up associated queues if it is
1606  */
1607 static void
1608 mqfs_prison_remove_task(void *context, int pending)
1609 {
1610 	struct mqfs_osd *mo = context;
1611 	struct mqfs_node *pn, *tpn;
1612 	const struct prison *pr;
1613 	const void *pr_root;
1614 	int found;
1615 
1616 	pr_root = mo->mo_pr_root;
1617 	found = 0;
1618 	sx_slock(&allprison_lock);
1619 	TAILQ_FOREACH(pr, &allprison, pr_list) {
1620 		if (pr->pr_root == pr_root)
1621 			found = 1;
1622 	}
1623 	sx_sunlock(&allprison_lock);
1624 	if (!found) {
1625 		/*
1626 		 * No jails are rooted in this directory anymore,
1627 		 * so no queues should be either.
1628 		 */
1629 		sx_xlock(&mqfs_data.mi_lock);
1630 		LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children,
1631 		    mn_sibling, tpn) {
1632 			if (pn->mn_pr_root == pr_root)
1633 				(void)do_unlink(pn, curthread->td_ucred);
1634 		}
1635 		sx_xunlock(&mqfs_data.mi_lock);
1636 	}
1637 	free(mo, M_PRISON);
1638 }
1639 
1640 
1641 /*
1642  * Allocate a message queue
1643  */
1644 static struct mqueue *
1645 mqueue_alloc(const struct mq_attr *attr)
1646 {
1647 	struct mqueue *mq;
1648 
1649 	if (curmq >= maxmq)
1650 		return (NULL);
1651 	mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
1652 	TAILQ_INIT(&mq->mq_msgq);
1653 	if (attr != NULL) {
1654 		mq->mq_maxmsg = attr->mq_maxmsg;
1655 		mq->mq_msgsize = attr->mq_msgsize;
1656 	} else {
1657 		mq->mq_maxmsg = default_maxmsg;
1658 		mq->mq_msgsize = default_msgsize;
1659 	}
1660 	mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF);
1661 	knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex);
1662 	knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex);
1663 	atomic_add_int(&curmq, 1);
1664 	return (mq);
1665 }
1666 
1667 /*
1668  * Destroy a message queue
1669  */
1670 static void
1671 mqueue_free(struct mqueue *mq)
1672 {
1673 	struct mqueue_msg *msg;
1674 
1675 	while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) {
1676 		TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link);
1677 		free(msg, M_MQUEUEDATA);
1678 	}
1679 
1680 	mtx_destroy(&mq->mq_mutex);
1681 	seldrain(&mq->mq_rsel);
1682 	seldrain(&mq->mq_wsel);
1683 	knlist_destroy(&mq->mq_rsel.si_note);
1684 	knlist_destroy(&mq->mq_wsel.si_note);
1685 	uma_zfree(mqueue_zone, mq);
1686 	atomic_add_int(&curmq, -1);
1687 }
1688 
1689 /*
1690  * Load a message from user space
1691  */
1692 static struct mqueue_msg *
1693 mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
1694 {
1695 	struct mqueue_msg *msg;
1696 	size_t len;
1697 	int error;
1698 
1699 	len = sizeof(struct mqueue_msg) + msg_size;
1700 	msg = malloc(len, M_MQUEUEDATA, M_WAITOK);
1701 	error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg),
1702 	    msg_size);
1703 	if (error) {
1704 		free(msg, M_MQUEUEDATA);
1705 		msg = NULL;
1706 	} else {
1707 		msg->msg_size = msg_size;
1708 		msg->msg_prio = msg_prio;
1709 	}
1710 	return (msg);
1711 }
1712 
1713 /*
1714  * Save a message to user space
1715  */
1716 static int
1717 mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
1718 {
1719 	int error;
1720 
1721 	error = copyout(((char *)msg) + sizeof(*msg), msg_ptr,
1722 		msg->msg_size);
1723 	if (error == 0 && msg_prio != NULL)
1724 		error = copyout(&msg->msg_prio, msg_prio, sizeof(int));
1725 	return (error);
1726 }
1727 
1728 /*
1729  * Free a message's memory
1730  */
1731 static __inline void
1732 mqueue_freemsg(struct mqueue_msg *msg)
1733 {
1734 	free(msg, M_MQUEUEDATA);
1735 }
1736 
1737 /*
1738  * Send a message. if waitok is false, thread will not be
1739  * blocked if there is no data in queue, otherwise, absolute
1740  * time will be checked.
1741  */
1742 int
1743 mqueue_send(struct mqueue *mq, const char *msg_ptr,
1744 	size_t msg_len, unsigned msg_prio, int waitok,
1745 	const struct timespec *abs_timeout)
1746 {
1747 	struct mqueue_msg *msg;
1748 	struct timespec ts, ts2;
1749 	struct timeval tv;
1750 	int error;
1751 
1752 	if (msg_prio >= MQ_PRIO_MAX)
1753 		return (EINVAL);
1754 	if (msg_len > mq->mq_msgsize)
1755 		return (EMSGSIZE);
1756 	msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
1757 	if (msg == NULL)
1758 		return (EFAULT);
1759 
1760 	/* O_NONBLOCK case */
1761 	if (!waitok) {
1762 		error = _mqueue_send(mq, msg, -1);
1763 		if (error)
1764 			goto bad;
1765 		return (0);
1766 	}
1767 
1768 	/* we allow a null timeout (wait forever) */
1769 	if (abs_timeout == NULL) {
1770 		error = _mqueue_send(mq, msg, 0);
1771 		if (error)
1772 			goto bad;
1773 		return (0);
1774 	}
1775 
1776 	/* send it before checking time */
1777 	error = _mqueue_send(mq, msg, -1);
1778 	if (error == 0)
1779 		return (0);
1780 
1781 	if (error != EAGAIN)
1782 		goto bad;
1783 
1784 	if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
1785 		error = EINVAL;
1786 		goto bad;
1787 	}
1788 	for (;;) {
1789 		ts2 = *abs_timeout;
1790 		getnanotime(&ts);
1791 		timespecsub(&ts2, &ts);
1792 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
1793 			error = ETIMEDOUT;
1794 			break;
1795 		}
1796 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
1797 		error = _mqueue_send(mq, msg, tvtohz(&tv));
1798 		if (error != ETIMEDOUT)
1799 			break;
1800 	}
1801 	if (error == 0)
1802 		return (0);
1803 bad:
1804 	mqueue_freemsg(msg);
1805 	return (error);
1806 }
1807 
1808 /*
1809  * Common routine to send a message
1810  */
1811 static int
1812 _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
1813 {
1814 	struct mqueue_msg *msg2;
1815 	int error = 0;
1816 
1817 	mtx_lock(&mq->mq_mutex);
1818 	while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
1819 		if (timo < 0) {
1820 			mtx_unlock(&mq->mq_mutex);
1821 			return (EAGAIN);
1822 		}
1823 		mq->mq_senders++;
1824 		error = msleep(&mq->mq_senders, &mq->mq_mutex,
1825 			    PCATCH, "mqsend", timo);
1826 		mq->mq_senders--;
1827 		if (error == EAGAIN)
1828 			error = ETIMEDOUT;
1829 	}
1830 	if (mq->mq_curmsgs >= mq->mq_maxmsg) {
1831 		mtx_unlock(&mq->mq_mutex);
1832 		return (error);
1833 	}
1834 	error = 0;
1835 	if (TAILQ_EMPTY(&mq->mq_msgq)) {
1836 		TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
1837 	} else {
1838 		if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
1839 			TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
1840 		} else {
1841 			TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
1842 				if (msg2->msg_prio < msg->msg_prio)
1843 					break;
1844 			}
1845 			TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
1846 		}
1847 	}
1848 	mq->mq_curmsgs++;
1849 	mq->mq_totalbytes += msg->msg_size;
1850 	if (mq->mq_receivers)
1851 		wakeup_one(&mq->mq_receivers);
1852 	else if (mq->mq_notifier != NULL)
1853 		mqueue_send_notification(mq);
1854 	if (mq->mq_flags & MQ_RSEL) {
1855 		mq->mq_flags &= ~MQ_RSEL;
1856 		selwakeup(&mq->mq_rsel);
1857 	}
1858 	KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
1859 	mtx_unlock(&mq->mq_mutex);
1860 	return (0);
1861 }
1862 
1863 /*
1864  * Send realtime a signal to process which registered itself
1865  * successfully by mq_notify.
1866  */
1867 static void
1868 mqueue_send_notification(struct mqueue *mq)
1869 {
1870 	struct mqueue_notifier *nt;
1871 	struct thread *td;
1872 	struct proc *p;
1873 	int error;
1874 
1875 	mtx_assert(&mq->mq_mutex, MA_OWNED);
1876 	nt = mq->mq_notifier;
1877 	if (nt->nt_sigev.sigev_notify != SIGEV_NONE) {
1878 		p = nt->nt_proc;
1879 		error = sigev_findtd(p, &nt->nt_sigev, &td);
1880 		if (error) {
1881 			mq->mq_notifier = NULL;
1882 			return;
1883 		}
1884 		if (!KSI_ONQ(&nt->nt_ksi)) {
1885 			ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev);
1886 			tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi);
1887 		}
1888 		PROC_UNLOCK(p);
1889 	}
1890 	mq->mq_notifier = NULL;
1891 }
1892 
1893 /*
1894  * Get a message. if waitok is false, thread will not be
1895  * blocked if there is no data in queue, otherwise, absolute
1896  * time will be checked.
1897  */
1898 int
1899 mqueue_receive(struct mqueue *mq, char *msg_ptr,
1900 	size_t msg_len, unsigned *msg_prio, int waitok,
1901 	const struct timespec *abs_timeout)
1902 {
1903 	struct mqueue_msg *msg;
1904 	struct timespec ts, ts2;
1905 	struct timeval tv;
1906 	int error;
1907 
1908 	if (msg_len < mq->mq_msgsize)
1909 		return (EMSGSIZE);
1910 
1911 	/* O_NONBLOCK case */
1912 	if (!waitok) {
1913 		error = _mqueue_recv(mq, &msg, -1);
1914 		if (error)
1915 			return (error);
1916 		goto received;
1917 	}
1918 
1919 	/* we allow a null timeout (wait forever). */
1920 	if (abs_timeout == NULL) {
1921 		error = _mqueue_recv(mq, &msg, 0);
1922 		if (error)
1923 			return (error);
1924 		goto received;
1925 	}
1926 
1927 	/* try to get a message before checking time */
1928 	error = _mqueue_recv(mq, &msg, -1);
1929 	if (error == 0)
1930 		goto received;
1931 
1932 	if (error != EAGAIN)
1933 		return (error);
1934 
1935 	if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
1936 		error = EINVAL;
1937 		return (error);
1938 	}
1939 
1940 	for (;;) {
1941 		ts2 = *abs_timeout;
1942 		getnanotime(&ts);
1943 		timespecsub(&ts2, &ts);
1944 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
1945 			error = ETIMEDOUT;
1946 			return (error);
1947 		}
1948 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
1949 		error = _mqueue_recv(mq, &msg, tvtohz(&tv));
1950 		if (error == 0)
1951 			break;
1952 		if (error != ETIMEDOUT)
1953 			return (error);
1954 	}
1955 
1956 received:
1957 	error = mqueue_savemsg(msg, msg_ptr, msg_prio);
1958 	if (error == 0) {
1959 		curthread->td_retval[0] = msg->msg_size;
1960 		curthread->td_retval[1] = 0;
1961 	}
1962 	mqueue_freemsg(msg);
1963 	return (error);
1964 }
1965 
1966 /*
1967  * Common routine to receive a message
1968  */
1969 static int
1970 _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
1971 {
1972 	int error = 0;
1973 
1974 	mtx_lock(&mq->mq_mutex);
1975 	while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
1976 		if (timo < 0) {
1977 			mtx_unlock(&mq->mq_mutex);
1978 			return (EAGAIN);
1979 		}
1980 		mq->mq_receivers++;
1981 		error = msleep(&mq->mq_receivers, &mq->mq_mutex,
1982 			    PCATCH, "mqrecv", timo);
1983 		mq->mq_receivers--;
1984 		if (error == EAGAIN)
1985 			error = ETIMEDOUT;
1986 	}
1987 	if (*msg != NULL) {
1988 		error = 0;
1989 		TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
1990 		mq->mq_curmsgs--;
1991 		mq->mq_totalbytes -= (*msg)->msg_size;
1992 		if (mq->mq_senders)
1993 			wakeup_one(&mq->mq_senders);
1994 		if (mq->mq_flags & MQ_WSEL) {
1995 			mq->mq_flags &= ~MQ_WSEL;
1996 			selwakeup(&mq->mq_wsel);
1997 		}
1998 		KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
1999 	}
2000 	if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
2001 	    !TAILQ_EMPTY(&mq->mq_msgq)) {
2002 		mqueue_send_notification(mq);
2003 	}
2004 	mtx_unlock(&mq->mq_mutex);
2005 	return (error);
2006 }
2007 
2008 static __inline struct mqueue_notifier *
2009 notifier_alloc(void)
2010 {
2011 	return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
2012 }
2013 
2014 static __inline void
2015 notifier_free(struct mqueue_notifier *p)
2016 {
2017 	uma_zfree(mqnoti_zone, p);
2018 }
2019 
2020 static struct mqueue_notifier *
2021 notifier_search(struct proc *p, int fd)
2022 {
2023 	struct mqueue_notifier *nt;
2024 
2025 	LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) {
2026 		if (nt->nt_ksi.ksi_mqd == fd)
2027 			break;
2028 	}
2029 	return (nt);
2030 }
2031 
2032 static __inline void
2033 notifier_insert(struct proc *p, struct mqueue_notifier *nt)
2034 {
2035 	LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
2036 }
2037 
2038 static __inline void
2039 notifier_delete(struct proc *p, struct mqueue_notifier *nt)
2040 {
2041 	LIST_REMOVE(nt, nt_link);
2042 	notifier_free(nt);
2043 }
2044 
2045 static void
2046 notifier_remove(struct proc *p, struct mqueue *mq, int fd)
2047 {
2048 	struct mqueue_notifier *nt;
2049 
2050 	mtx_assert(&mq->mq_mutex, MA_OWNED);
2051 	PROC_LOCK(p);
2052 	nt = notifier_search(p, fd);
2053 	if (nt != NULL) {
2054 		if (mq->mq_notifier == nt)
2055 			mq->mq_notifier = NULL;
2056 		sigqueue_take(&nt->nt_ksi);
2057 		notifier_delete(p, nt);
2058 	}
2059 	PROC_UNLOCK(p);
2060 }
2061 
2062 static int
2063 kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode,
2064     const struct mq_attr *attr)
2065 {
2066 	char path[MQFS_NAMELEN + 1];
2067 	struct mqfs_node *pn;
2068 	struct filedesc *fdp;
2069 	struct file *fp;
2070 	struct mqueue *mq;
2071 	int fd, error, len, cmode;
2072 
2073 	fdp = td->td_proc->p_fd;
2074 	cmode = (((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT);
2075 	mq = NULL;
2076 	if ((flags & O_CREAT) != 0 && attr != NULL) {
2077 		if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg)
2078 			return (EINVAL);
2079 		if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize)
2080 			return (EINVAL);
2081 	}
2082 
2083 	error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL);
2084         if (error)
2085 		return (error);
2086 
2087 	/*
2088 	 * The first character of name must be a slash  (/) character
2089 	 * and the remaining characters of name cannot include any slash
2090 	 * characters.
2091 	 */
2092 	len = strlen(path);
2093 	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
2094 		return (EINVAL);
2095 
2096 	error = falloc(td, &fp, &fd, O_CLOEXEC);
2097 	if (error)
2098 		return (error);
2099 
2100 	sx_xlock(&mqfs_data.mi_lock);
2101 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
2102 	if (pn == NULL) {
2103 		if (!(flags & O_CREAT)) {
2104 			error = ENOENT;
2105 		} else {
2106 			mq = mqueue_alloc(attr);
2107 			if (mq == NULL) {
2108 				error = ENFILE;
2109 			} else {
2110 				pn = mqfs_create_file(mqfs_data.mi_root,
2111 				         path + 1, len - 1, td->td_ucred,
2112 					 cmode);
2113 				if (pn == NULL) {
2114 					error = ENOSPC;
2115 					mqueue_free(mq);
2116 				}
2117 			}
2118 		}
2119 
2120 		if (error == 0) {
2121 			pn->mn_data = mq;
2122 		}
2123 	} else {
2124 		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
2125 			error = EEXIST;
2126 		} else {
2127 			accmode_t accmode = 0;
2128 
2129 			if (flags & FREAD)
2130 				accmode |= VREAD;
2131 			if (flags & FWRITE)
2132 				accmode |= VWRITE;
2133 			error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
2134 				    pn->mn_gid, accmode, td->td_ucred, NULL);
2135 		}
2136 	}
2137 
2138 	if (error) {
2139 		sx_xunlock(&mqfs_data.mi_lock);
2140 		fdclose(td, fp, fd);
2141 		fdrop(fp, td);
2142 		return (error);
2143 	}
2144 
2145 	mqnode_addref(pn);
2146 	sx_xunlock(&mqfs_data.mi_lock);
2147 
2148 	finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn,
2149 	    &mqueueops);
2150 
2151 	td->td_retval[0] = fd;
2152 	fdrop(fp, td);
2153 	return (0);
2154 }
2155 
2156 /*
2157  * Syscall to open a message queue.
2158  */
2159 int
2160 sys_kmq_open(struct thread *td, struct kmq_open_args *uap)
2161 {
2162 	struct mq_attr attr;
2163 	int flags, error;
2164 
2165 	if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
2166 		return (EINVAL);
2167 	flags = FFLAGS(uap->flags);
2168 	if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
2169 		error = copyin(uap->attr, &attr, sizeof(attr));
2170 		if (error)
2171 			return (error);
2172 	}
2173 	return (kern_kmq_open(td, uap->path, flags, uap->mode,
2174 	    uap->attr != NULL ? &attr : NULL));
2175 }
2176 
2177 /*
2178  * Syscall to unlink a message queue.
2179  */
2180 int
2181 sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
2182 {
2183 	char path[MQFS_NAMELEN+1];
2184 	struct mqfs_node *pn;
2185 	int error, len;
2186 
2187 	error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
2188         if (error)
2189 		return (error);
2190 
2191 	len = strlen(path);
2192 	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
2193 		return (EINVAL);
2194 
2195 	sx_xlock(&mqfs_data.mi_lock);
2196 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
2197 	if (pn != NULL)
2198 		error = do_unlink(pn, td->td_ucred);
2199 	else
2200 		error = ENOENT;
2201 	sx_xunlock(&mqfs_data.mi_lock);
2202 	return (error);
2203 }
2204 
2205 typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **);
2206 
2207 /*
2208  * Get message queue by giving file slot
2209  */
2210 static int
2211 _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func,
2212        struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
2213 {
2214 	struct mqfs_node *pn;
2215 	int error;
2216 
2217 	error = func(td, fd, rightsp, fpp);
2218 	if (error)
2219 		return (error);
2220 	if (&mqueueops != (*fpp)->f_ops) {
2221 		fdrop(*fpp, td);
2222 		return (EBADF);
2223 	}
2224 	pn = (*fpp)->f_data;
2225 	if (ppn)
2226 		*ppn = pn;
2227 	if (pmq)
2228 		*pmq = pn->mn_data;
2229 	return (0);
2230 }
2231 
2232 static __inline int
2233 getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
2234 	struct mqueue **pmq)
2235 {
2236 	cap_rights_t rights;
2237 
2238 	return _getmq(td, fd, cap_rights_init(&rights, CAP_EVENT), fget,
2239 	    fpp, ppn, pmq);
2240 }
2241 
2242 static __inline int
2243 getmq_read(struct thread *td, int fd, struct file **fpp,
2244 	 struct mqfs_node **ppn, struct mqueue **pmq)
2245 {
2246 	cap_rights_t rights;
2247 
2248 	return _getmq(td, fd, cap_rights_init(&rights, CAP_READ), fget_read,
2249 	    fpp, ppn, pmq);
2250 }
2251 
2252 static __inline int
2253 getmq_write(struct thread *td, int fd, struct file **fpp,
2254 	struct mqfs_node **ppn, struct mqueue **pmq)
2255 {
2256 	cap_rights_t rights;
2257 
2258 	return _getmq(td, fd, cap_rights_init(&rights, CAP_WRITE), fget_write,
2259 	    fpp, ppn, pmq);
2260 }
2261 
2262 static int
2263 kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr,
2264     struct mq_attr *oattr)
2265 {
2266 	struct mqueue *mq;
2267 	struct file *fp;
2268 	u_int oflag, flag;
2269 	int error;
2270 
2271 	if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0)
2272 		return (EINVAL);
2273 	error = getmq(td, mqd, &fp, NULL, &mq);
2274 	if (error)
2275 		return (error);
2276 	oattr->mq_maxmsg  = mq->mq_maxmsg;
2277 	oattr->mq_msgsize = mq->mq_msgsize;
2278 	oattr->mq_curmsgs = mq->mq_curmsgs;
2279 	if (attr != NULL) {
2280 		do {
2281 			oflag = flag = fp->f_flag;
2282 			flag &= ~O_NONBLOCK;
2283 			flag |= (attr->mq_flags & O_NONBLOCK);
2284 		} while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);
2285 	} else
2286 		oflag = fp->f_flag;
2287 	oattr->mq_flags = (O_NONBLOCK & oflag);
2288 	fdrop(fp, td);
2289 	return (error);
2290 }
2291 
2292 int
2293 sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
2294 {
2295 	struct mq_attr attr, oattr;
2296 	int error;
2297 
2298 	if (uap->attr != NULL) {
2299 		error = copyin(uap->attr, &attr, sizeof(attr));
2300 		if (error != 0)
2301 			return (error);
2302 	}
2303 	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
2304 	    &oattr);
2305 	if (error != 0)
2306 		return (error);
2307 	if (uap->oattr != NULL)
2308 		error = copyout(&oattr, uap->oattr, sizeof(oattr));
2309 	return (error);
2310 }
2311 
2312 int
2313 sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
2314 {
2315 	struct mqueue *mq;
2316 	struct file *fp;
2317 	struct timespec *abs_timeout, ets;
2318 	int error;
2319 	int waitok;
2320 
2321 	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
2322 	if (error)
2323 		return (error);
2324 	if (uap->abs_timeout != NULL) {
2325 		error = copyin(uap->abs_timeout, &ets, sizeof(ets));
2326 		if (error != 0)
2327 			return (error);
2328 		abs_timeout = &ets;
2329 	} else
2330 		abs_timeout = NULL;
2331 	waitok = !(fp->f_flag & O_NONBLOCK);
2332 	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
2333 		uap->msg_prio, waitok, abs_timeout);
2334 	fdrop(fp, td);
2335 	return (error);
2336 }
2337 
2338 int
2339 sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
2340 {
2341 	struct mqueue *mq;
2342 	struct file *fp;
2343 	struct timespec *abs_timeout, ets;
2344 	int error, waitok;
2345 
2346 	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
2347 	if (error)
2348 		return (error);
2349 	if (uap->abs_timeout != NULL) {
2350 		error = copyin(uap->abs_timeout, &ets, sizeof(ets));
2351 		if (error != 0)
2352 			return (error);
2353 		abs_timeout = &ets;
2354 	} else
2355 		abs_timeout = NULL;
2356 	waitok = !(fp->f_flag & O_NONBLOCK);
2357 	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
2358 		uap->msg_prio, waitok, abs_timeout);
2359 	fdrop(fp, td);
2360 	return (error);
2361 }
2362 
2363 static int
2364 kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev)
2365 {
2366 #ifdef CAPABILITIES
2367 	cap_rights_t rights;
2368 #endif
2369 	struct filedesc *fdp;
2370 	struct proc *p;
2371 	struct mqueue *mq;
2372 	struct file *fp, *fp2;
2373 	struct mqueue_notifier *nt, *newnt = NULL;
2374 	int error;
2375 
2376 	if (sigev != NULL) {
2377 		if (sigev->sigev_notify != SIGEV_SIGNAL &&
2378 		    sigev->sigev_notify != SIGEV_THREAD_ID &&
2379 		    sigev->sigev_notify != SIGEV_NONE)
2380 			return (EINVAL);
2381 		if ((sigev->sigev_notify == SIGEV_SIGNAL ||
2382 		    sigev->sigev_notify == SIGEV_THREAD_ID) &&
2383 		    !_SIG_VALID(sigev->sigev_signo))
2384 			return (EINVAL);
2385 	}
2386 	p = td->td_proc;
2387 	fdp = td->td_proc->p_fd;
2388 	error = getmq(td, mqd, &fp, NULL, &mq);
2389 	if (error)
2390 		return (error);
2391 again:
2392 	FILEDESC_SLOCK(fdp);
2393 	fp2 = fget_locked(fdp, mqd);
2394 	if (fp2 == NULL) {
2395 		FILEDESC_SUNLOCK(fdp);
2396 		error = EBADF;
2397 		goto out;
2398 	}
2399 #ifdef CAPABILITIES
2400 	error = cap_check(cap_rights(fdp, mqd),
2401 	    cap_rights_init(&rights, CAP_EVENT));
2402 	if (error) {
2403 		FILEDESC_SUNLOCK(fdp);
2404 		goto out;
2405 	}
2406 #endif
2407 	if (fp2 != fp) {
2408 		FILEDESC_SUNLOCK(fdp);
2409 		error = EBADF;
2410 		goto out;
2411 	}
2412 	mtx_lock(&mq->mq_mutex);
2413 	FILEDESC_SUNLOCK(fdp);
2414 	if (sigev != NULL) {
2415 		if (mq->mq_notifier != NULL) {
2416 			error = EBUSY;
2417 		} else {
2418 			PROC_LOCK(p);
2419 			nt = notifier_search(p, mqd);
2420 			if (nt == NULL) {
2421 				if (newnt == NULL) {
2422 					PROC_UNLOCK(p);
2423 					mtx_unlock(&mq->mq_mutex);
2424 					newnt = notifier_alloc();
2425 					goto again;
2426 				}
2427 			}
2428 
2429 			if (nt != NULL) {
2430 				sigqueue_take(&nt->nt_ksi);
2431 				if (newnt != NULL) {
2432 					notifier_free(newnt);
2433 					newnt = NULL;
2434 				}
2435 			} else {
2436 				nt = newnt;
2437 				newnt = NULL;
2438 				ksiginfo_init(&nt->nt_ksi);
2439 				nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
2440 				nt->nt_ksi.ksi_code = SI_MESGQ;
2441 				nt->nt_proc = p;
2442 				nt->nt_ksi.ksi_mqd = mqd;
2443 				notifier_insert(p, nt);
2444 			}
2445 			nt->nt_sigev = *sigev;
2446 			mq->mq_notifier = nt;
2447 			PROC_UNLOCK(p);
2448 			/*
2449 			 * if there is no receivers and message queue
2450 			 * is not empty, we should send notification
2451 			 * as soon as possible.
2452 			 */
2453 			if (mq->mq_receivers == 0 &&
2454 			    !TAILQ_EMPTY(&mq->mq_msgq))
2455 				mqueue_send_notification(mq);
2456 		}
2457 	} else {
2458 		notifier_remove(p, mq, mqd);
2459 	}
2460 	mtx_unlock(&mq->mq_mutex);
2461 
2462 out:
2463 	fdrop(fp, td);
2464 	if (newnt != NULL)
2465 		notifier_free(newnt);
2466 	return (error);
2467 }
2468 
2469 int
2470 sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
2471 {
2472 	struct sigevent ev, *evp;
2473 	int error;
2474 
2475 	if (uap->sigev == NULL) {
2476 		evp = NULL;
2477 	} else {
2478 		error = copyin(uap->sigev, &ev, sizeof(ev));
2479 		if (error != 0)
2480 			return (error);
2481 		evp = &ev;
2482 	}
2483 	return (kern_kmq_notify(td, uap->mqd, evp));
2484 }
2485 
2486 static void
2487 mqueue_fdclose(struct thread *td, int fd, struct file *fp)
2488 {
2489 	struct filedesc *fdp;
2490 	struct mqueue *mq;
2491 
2492 	fdp = td->td_proc->p_fd;
2493 	FILEDESC_LOCK_ASSERT(fdp);
2494 
2495 	if (fp->f_ops == &mqueueops) {
2496 		mq = FPTOMQ(fp);
2497 		mtx_lock(&mq->mq_mutex);
2498 		notifier_remove(td->td_proc, mq, fd);
2499 
2500 		/* have to wakeup thread in same process */
2501 		if (mq->mq_flags & MQ_RSEL) {
2502 			mq->mq_flags &= ~MQ_RSEL;
2503 			selwakeup(&mq->mq_rsel);
2504 		}
2505 		if (mq->mq_flags & MQ_WSEL) {
2506 			mq->mq_flags &= ~MQ_WSEL;
2507 			selwakeup(&mq->mq_wsel);
2508 		}
2509 		mtx_unlock(&mq->mq_mutex);
2510 	}
2511 }
2512 
2513 static void
2514 mq_proc_exit(void *arg __unused, struct proc *p)
2515 {
2516 	struct filedesc *fdp;
2517 	struct file *fp;
2518 	struct mqueue *mq;
2519 	int i;
2520 
2521 	fdp = p->p_fd;
2522 	FILEDESC_SLOCK(fdp);
2523 	for (i = 0; i < fdp->fd_nfiles; ++i) {
2524 		fp = fget_locked(fdp, i);
2525 		if (fp != NULL && fp->f_ops == &mqueueops) {
2526 			mq = FPTOMQ(fp);
2527 			mtx_lock(&mq->mq_mutex);
2528 			notifier_remove(p, FPTOMQ(fp), i);
2529 			mtx_unlock(&mq->mq_mutex);
2530 		}
2531 	}
2532 	FILEDESC_SUNLOCK(fdp);
2533 	KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
2534 }
2535 
2536 static int
2537 mqf_poll(struct file *fp, int events, struct ucred *active_cred,
2538 	struct thread *td)
2539 {
2540 	struct mqueue *mq = FPTOMQ(fp);
2541 	int revents = 0;
2542 
2543 	mtx_lock(&mq->mq_mutex);
2544 	if (events & (POLLIN | POLLRDNORM)) {
2545 		if (mq->mq_curmsgs) {
2546 			revents |= events & (POLLIN | POLLRDNORM);
2547 		} else {
2548 			mq->mq_flags |= MQ_RSEL;
2549 			selrecord(td, &mq->mq_rsel);
2550  		}
2551 	}
2552 	if (events & POLLOUT) {
2553 		if (mq->mq_curmsgs < mq->mq_maxmsg)
2554 			revents |= POLLOUT;
2555 		else {
2556 			mq->mq_flags |= MQ_WSEL;
2557 			selrecord(td, &mq->mq_wsel);
2558 		}
2559 	}
2560 	mtx_unlock(&mq->mq_mutex);
2561 	return (revents);
2562 }
2563 
2564 static int
2565 mqf_close(struct file *fp, struct thread *td)
2566 {
2567 	struct mqfs_node *pn;
2568 
2569 	fp->f_ops = &badfileops;
2570 	pn = fp->f_data;
2571 	fp->f_data = NULL;
2572 	sx_xlock(&mqfs_data.mi_lock);
2573 	mqnode_release(pn);
2574 	sx_xunlock(&mqfs_data.mi_lock);
2575 	return (0);
2576 }
2577 
2578 static int
2579 mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
2580 	struct thread *td)
2581 {
2582 	struct mqfs_node *pn = fp->f_data;
2583 
2584 	bzero(st, sizeof *st);
2585 	sx_xlock(&mqfs_data.mi_lock);
2586 	st->st_atim = pn->mn_atime;
2587 	st->st_mtim = pn->mn_mtime;
2588 	st->st_ctim = pn->mn_ctime;
2589 	st->st_birthtim = pn->mn_birth;
2590 	st->st_uid = pn->mn_uid;
2591 	st->st_gid = pn->mn_gid;
2592 	st->st_mode = S_IFIFO | pn->mn_mode;
2593 	sx_xunlock(&mqfs_data.mi_lock);
2594 	return (0);
2595 }
2596 
2597 static int
2598 mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
2599     struct thread *td)
2600 {
2601 	struct mqfs_node *pn;
2602 	int error;
2603 
2604 	error = 0;
2605 	pn = fp->f_data;
2606 	sx_xlock(&mqfs_data.mi_lock);
2607 	error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN,
2608 	    active_cred, NULL);
2609 	if (error != 0)
2610 		goto out;
2611 	pn->mn_mode = mode & ACCESSPERMS;
2612 out:
2613 	sx_xunlock(&mqfs_data.mi_lock);
2614 	return (error);
2615 }
2616 
2617 static int
2618 mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
2619     struct thread *td)
2620 {
2621 	struct mqfs_node *pn;
2622 	int error;
2623 
2624 	error = 0;
2625 	pn = fp->f_data;
2626 	sx_xlock(&mqfs_data.mi_lock);
2627 	if (uid == (uid_t)-1)
2628 		uid = pn->mn_uid;
2629 	if (gid == (gid_t)-1)
2630 		gid = pn->mn_gid;
2631 	if (((uid != pn->mn_uid && uid != active_cred->cr_uid) ||
2632 	    (gid != pn->mn_gid && !groupmember(gid, active_cred))) &&
2633 	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
2634 		goto out;
2635 	pn->mn_uid = uid;
2636 	pn->mn_gid = gid;
2637 out:
2638 	sx_xunlock(&mqfs_data.mi_lock);
2639 	return (error);
2640 }
2641 
2642 static int
2643 mqf_kqfilter(struct file *fp, struct knote *kn)
2644 {
2645 	struct mqueue *mq = FPTOMQ(fp);
2646 	int error = 0;
2647 
2648 	if (kn->kn_filter == EVFILT_READ) {
2649 		kn->kn_fop = &mq_rfiltops;
2650 		knlist_add(&mq->mq_rsel.si_note, kn, 0);
2651 	} else if (kn->kn_filter == EVFILT_WRITE) {
2652 		kn->kn_fop = &mq_wfiltops;
2653 		knlist_add(&mq->mq_wsel.si_note, kn, 0);
2654 	} else
2655 		error = EINVAL;
2656 	return (error);
2657 }
2658 
2659 static void
2660 filt_mqdetach(struct knote *kn)
2661 {
2662 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
2663 
2664 	if (kn->kn_filter == EVFILT_READ)
2665 		knlist_remove(&mq->mq_rsel.si_note, kn, 0);
2666 	else if (kn->kn_filter == EVFILT_WRITE)
2667 		knlist_remove(&mq->mq_wsel.si_note, kn, 0);
2668 	else
2669 		panic("filt_mqdetach");
2670 }
2671 
2672 static int
2673 filt_mqread(struct knote *kn, long hint)
2674 {
2675 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
2676 
2677 	mtx_assert(&mq->mq_mutex, MA_OWNED);
2678 	return (mq->mq_curmsgs != 0);
2679 }
2680 
2681 static int
2682 filt_mqwrite(struct knote *kn, long hint)
2683 {
2684 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
2685 
2686 	mtx_assert(&mq->mq_mutex, MA_OWNED);
2687 	return (mq->mq_curmsgs < mq->mq_maxmsg);
2688 }
2689 
2690 static int
2691 mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2692 {
2693 
2694 	kif->kf_type = KF_TYPE_MQUEUE;
2695 	return (0);
2696 }
2697 
2698 static struct fileops mqueueops = {
2699 	.fo_read		= invfo_rdwr,
2700 	.fo_write		= invfo_rdwr,
2701 	.fo_truncate		= invfo_truncate,
2702 	.fo_ioctl		= invfo_ioctl,
2703 	.fo_poll		= mqf_poll,
2704 	.fo_kqfilter		= mqf_kqfilter,
2705 	.fo_stat		= mqf_stat,
2706 	.fo_close		= mqf_close,
2707 	.fo_chmod		= mqf_chmod,
2708 	.fo_chown		= mqf_chown,
2709 	.fo_sendfile		= invfo_sendfile,
2710 	.fo_fill_kinfo		= mqf_fill_kinfo,
2711 };
2712 
2713 static struct vop_vector mqfs_vnodeops = {
2714 	.vop_default 		= &default_vnodeops,
2715 	.vop_access		= mqfs_access,
2716 	.vop_cachedlookup	= mqfs_lookup,
2717 	.vop_lookup		= vfs_cache_lookup,
2718 	.vop_reclaim		= mqfs_reclaim,
2719 	.vop_create		= mqfs_create,
2720 	.vop_remove		= mqfs_remove,
2721 	.vop_inactive		= mqfs_inactive,
2722 	.vop_open		= mqfs_open,
2723 	.vop_close		= mqfs_close,
2724 	.vop_getattr		= mqfs_getattr,
2725 	.vop_setattr		= mqfs_setattr,
2726 	.vop_read		= mqfs_read,
2727 	.vop_write		= VOP_EOPNOTSUPP,
2728 	.vop_readdir		= mqfs_readdir,
2729 	.vop_mkdir		= VOP_EOPNOTSUPP,
2730 	.vop_rmdir		= VOP_EOPNOTSUPP
2731 };
2732 
2733 static struct vfsops mqfs_vfsops = {
2734 	.vfs_init 		= mqfs_init,
2735 	.vfs_uninit		= mqfs_uninit,
2736 	.vfs_mount		= mqfs_mount,
2737 	.vfs_unmount		= mqfs_unmount,
2738 	.vfs_root		= mqfs_root,
2739 	.vfs_statfs		= mqfs_statfs,
2740 };
2741 
2742 static struct vfsconf mqueuefs_vfsconf = {
2743 	.vfc_version = VFS_VERSION,
2744 	.vfc_name = "mqueuefs",
2745 	.vfc_vfsops = &mqfs_vfsops,
2746 	.vfc_typenum = -1,
2747 	.vfc_flags = VFCF_SYNTHETIC
2748 };
2749 
2750 static struct syscall_helper_data mq_syscalls[] = {
2751 	SYSCALL_INIT_HELPER(kmq_open),
2752 	SYSCALL_INIT_HELPER(kmq_setattr),
2753 	SYSCALL_INIT_HELPER(kmq_timedsend),
2754 	SYSCALL_INIT_HELPER(kmq_timedreceive),
2755 	SYSCALL_INIT_HELPER(kmq_notify),
2756 	SYSCALL_INIT_HELPER(kmq_unlink),
2757 	SYSCALL_INIT_LAST
2758 };
2759 
2760 #ifdef COMPAT_FREEBSD32
2761 #include <compat/freebsd32/freebsd32.h>
2762 #include <compat/freebsd32/freebsd32_proto.h>
2763 #include <compat/freebsd32/freebsd32_signal.h>
2764 #include <compat/freebsd32/freebsd32_syscall.h>
2765 #include <compat/freebsd32/freebsd32_util.h>
2766 
2767 static void
2768 mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to)
2769 {
2770 
2771 	to->mq_flags = from->mq_flags;
2772 	to->mq_maxmsg = from->mq_maxmsg;
2773 	to->mq_msgsize = from->mq_msgsize;
2774 	to->mq_curmsgs = from->mq_curmsgs;
2775 }
2776 
2777 static void
2778 mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to)
2779 {
2780 
2781 	to->mq_flags = from->mq_flags;
2782 	to->mq_maxmsg = from->mq_maxmsg;
2783 	to->mq_msgsize = from->mq_msgsize;
2784 	to->mq_curmsgs = from->mq_curmsgs;
2785 }
2786 
2787 int
2788 freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap)
2789 {
2790 	struct mq_attr attr;
2791 	struct mq_attr32 attr32;
2792 	int flags, error;
2793 
2794 	if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
2795 		return (EINVAL);
2796 	flags = FFLAGS(uap->flags);
2797 	if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
2798 		error = copyin(uap->attr, &attr32, sizeof(attr32));
2799 		if (error)
2800 			return (error);
2801 		mq_attr_from32(&attr32, &attr);
2802 	}
2803 	return (kern_kmq_open(td, uap->path, flags, uap->mode,
2804 	    uap->attr != NULL ? &attr : NULL));
2805 }
2806 
2807 int
2808 freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap)
2809 {
2810 	struct mq_attr attr, oattr;
2811 	struct mq_attr32 attr32, oattr32;
2812 	int error;
2813 
2814 	if (uap->attr != NULL) {
2815 		error = copyin(uap->attr, &attr32, sizeof(attr32));
2816 		if (error != 0)
2817 			return (error);
2818 		mq_attr_from32(&attr32, &attr);
2819 	}
2820 	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
2821 	    &oattr);
2822 	if (error != 0)
2823 		return (error);
2824 	if (uap->oattr != NULL) {
2825 		mq_attr_to32(&oattr, &oattr32);
2826 		error = copyout(&oattr32, uap->oattr, sizeof(oattr32));
2827 	}
2828 	return (error);
2829 }
2830 
2831 int
2832 freebsd32_kmq_timedsend(struct thread *td,
2833     struct freebsd32_kmq_timedsend_args *uap)
2834 {
2835 	struct mqueue *mq;
2836 	struct file *fp;
2837 	struct timespec32 ets32;
2838 	struct timespec *abs_timeout, ets;
2839 	int error;
2840 	int waitok;
2841 
2842 	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
2843 	if (error)
2844 		return (error);
2845 	if (uap->abs_timeout != NULL) {
2846 		error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
2847 		if (error != 0)
2848 			return (error);
2849 		CP(ets32, ets, tv_sec);
2850 		CP(ets32, ets, tv_nsec);
2851 		abs_timeout = &ets;
2852 	} else
2853 		abs_timeout = NULL;
2854 	waitok = !(fp->f_flag & O_NONBLOCK);
2855 	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
2856 		uap->msg_prio, waitok, abs_timeout);
2857 	fdrop(fp, td);
2858 	return (error);
2859 }
2860 
2861 int
2862 freebsd32_kmq_timedreceive(struct thread *td,
2863     struct freebsd32_kmq_timedreceive_args *uap)
2864 {
2865 	struct mqueue *mq;
2866 	struct file *fp;
2867 	struct timespec32 ets32;
2868 	struct timespec *abs_timeout, ets;
2869 	int error, waitok;
2870 
2871 	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
2872 	if (error)
2873 		return (error);
2874 	if (uap->abs_timeout != NULL) {
2875 		error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
2876 		if (error != 0)
2877 			return (error);
2878 		CP(ets32, ets, tv_sec);
2879 		CP(ets32, ets, tv_nsec);
2880 		abs_timeout = &ets;
2881 	} else
2882 		abs_timeout = NULL;
2883 	waitok = !(fp->f_flag & O_NONBLOCK);
2884 	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
2885 		uap->msg_prio, waitok, abs_timeout);
2886 	fdrop(fp, td);
2887 	return (error);
2888 }
2889 
2890 int
2891 freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap)
2892 {
2893 	struct sigevent ev, *evp;
2894 	struct sigevent32 ev32;
2895 	int error;
2896 
2897 	if (uap->sigev == NULL) {
2898 		evp = NULL;
2899 	} else {
2900 		error = copyin(uap->sigev, &ev32, sizeof(ev32));
2901 		if (error != 0)
2902 			return (error);
2903 		error = convert_sigevent32(&ev32, &ev);
2904 		if (error != 0)
2905 			return (error);
2906 		evp = &ev;
2907 	}
2908 	return (kern_kmq_notify(td, uap->mqd, evp));
2909 }
2910 
2911 static struct syscall_helper_data mq32_syscalls[] = {
2912 	SYSCALL32_INIT_HELPER(freebsd32_kmq_open),
2913 	SYSCALL32_INIT_HELPER(freebsd32_kmq_setattr),
2914 	SYSCALL32_INIT_HELPER(freebsd32_kmq_timedsend),
2915 	SYSCALL32_INIT_HELPER(freebsd32_kmq_timedreceive),
2916 	SYSCALL32_INIT_HELPER(freebsd32_kmq_notify),
2917 	SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink),
2918 	SYSCALL_INIT_LAST
2919 };
2920 #endif
2921 
2922 static int
2923 mqinit(void)
2924 {
2925 	int error;
2926 
2927 	error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD);
2928 	if (error != 0)
2929 		return (error);
2930 #ifdef COMPAT_FREEBSD32
2931 	error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD);
2932 	if (error != 0)
2933 		return (error);
2934 #endif
2935 	return (0);
2936 }
2937 
2938 static int
2939 mqunload(void)
2940 {
2941 
2942 #ifdef COMPAT_FREEBSD32
2943 	syscall32_helper_unregister(mq32_syscalls);
2944 #endif
2945 	syscall_helper_unregister(mq_syscalls);
2946 	return (0);
2947 }
2948 
2949 static int
2950 mq_modload(struct module *module, int cmd, void *arg)
2951 {
2952 	int error = 0;
2953 
2954 	error = vfs_modevent(module, cmd, arg);
2955 	if (error != 0)
2956 		return (error);
2957 
2958 	switch (cmd) {
2959 	case MOD_LOAD:
2960 		error = mqinit();
2961 		if (error != 0)
2962 			mqunload();
2963 		break;
2964 	case MOD_UNLOAD:
2965 		error = mqunload();
2966 		break;
2967 	default:
2968 		break;
2969 	}
2970 	return (error);
2971 }
2972 
2973 static moduledata_t mqueuefs_mod = {
2974 	"mqueuefs",
2975 	mq_modload,
2976 	&mqueuefs_vfsconf
2977 };
2978 DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
2979 MODULE_VERSION(mqueuefs, 1);
2980