xref: /freebsd/sys/kern/vfs_inotify.c (revision 328110da2661a8841f12000b99fea27ceacdd5b2)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2025 Klara, Inc.
5  */
6 
7 #include "opt_ktrace.h"
8 
9 #include <sys/param.h>
10 #include <sys/systm.h>
11 #include <sys/caprights.h>
12 #include <sys/counter.h>
13 #include <sys/dirent.h>
14 #define	EXTERR_CATEGORY	EXTERR_CAT_INOTIFY
15 #include <sys/exterrvar.h>
16 #include <sys/fcntl.h>
17 #include <sys/file.h>
18 #include <sys/filio.h>
19 #include <sys/inotify.h>
20 #include <sys/kernel.h>
21 #include <sys/lock.h>
22 #include <sys/ktrace.h>
23 #include <sys/malloc.h>
24 #include <sys/mutex.h>
25 #include <sys/namei.h>
26 #include <sys/poll.h>
27 #include <sys/proc.h>
28 #include <sys/queue.h>
29 #include <sys/resourcevar.h>
30 #include <sys/selinfo.h>
31 #include <sys/stat.h>
32 #include <sys/syscallsubr.h>
33 #include <sys/sysctl.h>
34 #include <sys/sysent.h>
35 #include <sys/syslimits.h>
36 #include <sys/sysproto.h>
37 #include <sys/tree.h>
38 #include <sys/user.h>
39 #include <sys/vnode.h>
40 
41 uint32_t inotify_rename_cookie;
42 
43 static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
44     "inotify configuration");
45 
46 static int inotify_max_queued_events = 16384;
47 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
48     &inotify_max_queued_events, 0,
49     "Maximum number of events to queue on an inotify descriptor");
50 
51 static int inotify_max_user_instances = 256;
52 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
53     &inotify_max_user_instances, 0,
54     "Maximum number of inotify descriptors per user");
55 
56 static int inotify_max_user_watches;
57 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
58     &inotify_max_user_watches, 0,
59     "Maximum number of inotify watches per user");
60 
61 static int inotify_max_watches;
62 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
63     &inotify_max_watches, 0,
64     "Maximum number of inotify watches system-wide");
65 
66 static int inotify_watches;
67 SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
68     &inotify_watches, 0,
69     "Total number of inotify watches currently in use");
70 
71 static int inotify_coalesce = 1;
72 SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
73     &inotify_coalesce, 0,
74     "Coalesce inotify events when possible");
75 
76 static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
77 SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
78     &inotify_event_drops,
79     "Number of inotify events dropped due to limits or allocation failures");
80 
81 static fo_rdwr_t	inotify_read;
82 static fo_ioctl_t	inotify_ioctl;
83 static fo_poll_t	inotify_poll;
84 static fo_kqfilter_t	inotify_kqfilter;
85 static fo_stat_t	inotify_stat;
86 static fo_close_t	inotify_close;
87 static fo_fill_kinfo_t	inotify_fill_kinfo;
88 
89 static const struct fileops inotifyfdops = {
90 	.fo_read = inotify_read,
91 	.fo_write = invfo_rdwr,
92 	.fo_truncate = invfo_truncate,
93 	.fo_ioctl = inotify_ioctl,
94 	.fo_poll = inotify_poll,
95 	.fo_kqfilter = inotify_kqfilter,
96 	.fo_stat = inotify_stat,
97 	.fo_close = inotify_close,
98 	.fo_chmod = invfo_chmod,
99 	.fo_chown = invfo_chown,
100 	.fo_sendfile = invfo_sendfile,
101 	.fo_fill_kinfo = inotify_fill_kinfo,
102 	.fo_cmp = file_kcmp_generic,
103 	.fo_flags = DFLAG_PASSABLE,
104 };
105 
106 static void	filt_inotifydetach(struct knote *kn);
107 static int	filt_inotifyevent(struct knote *kn, long hint);
108 
109 static const struct filterops inotify_rfiltops = {
110 	.f_isfd = 1,
111 	.f_detach = filt_inotifydetach,
112 	.f_event = filt_inotifyevent,
113 };
114 
115 static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");
116 
117 struct inotify_record {
118 	STAILQ_ENTRY(inotify_record) link;
119 	struct inotify_event	ev;
120 };
121 
122 static uint64_t inotify_ino = 1;
123 
124 /*
125  * On LP64 systems this occupies 64 bytes, so we don't get internal
126  * fragmentation by allocating watches with malloc(9).  If the size changes,
127  * consider using a UMA zone to improve memory efficiency.
128  */
129 struct inotify_watch {
130 	struct inotify_softc *sc; /* back-pointer */
131 	int		wd;	/* unique ID */
132 	uint32_t	mask;	/* event mask */
133 	struct vnode	*vp;	/* vnode being watched, refed */
134 	RB_ENTRY(inotify_watch) ilink;		/* inotify linkage */
135 	TAILQ_ENTRY(inotify_watch) vlink;	/* vnode linkage */
136 };
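
/*
 * A rough accounting of the 64-byte figure noted above, on LP64 and assuming
 * the three-pointer RB_ENTRY layout of the current <sys/tree.h>:
 *
 *	sc			 8 bytes
 *	wd + mask		 4 + 4
 *	vp			 8
 *	RB_ENTRY(ilink)		 3 * 8 = 24
 *	TAILQ_ENTRY(vlink)	 2 * 8 = 16
 *				------------
 *				64 bytes, which maps onto a 64-byte malloc(9)
 *				bucket with no slack.
 */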
137 
138 static void
139 inotify_init(void *arg __unused)
140 {
141 	/* Don't let a user hold too many vnodes. */
142 	inotify_max_user_watches = desiredvnodes / 3;
143 	/* Don't let the system hold too many vnodes. */
144 	inotify_max_watches = desiredvnodes / 2;
145 }
146 SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);
147 
148 static int
149 inotify_watch_cmp(const struct inotify_watch *a,
150     const struct inotify_watch *b)
151 {
152 	if (a->wd < b->wd)
153 		return (-1);
154 	else if (a->wd > b->wd)
155 		return (1);
156 	else
157 		return (0);
158 }
159 RB_HEAD(inotify_watch_tree, inotify_watch);
160 RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);
161 
162 struct inotify_softc {
163 	struct mtx	lock;			/* serialize all softc writes */
164 	STAILQ_HEAD(, inotify_record) pending;	/* events waiting to be read */
165 	struct inotify_record overflow;		/* preallocated record */
166 	int		nextwatch;		/* next watch ID to try */
167 	int		npending;		/* number of pending events */
168 	size_t		nbpending;		/* bytes available to read */
169 	uint64_t	ino;			/* unique identifier */
170 	struct inotify_watch_tree watches;	/* active watches */
171 	struct selinfo	sel;			/* select/poll/kevent info */
172 	struct ucred	*cred;			/* credential ref */
173 };
174 
175 static struct inotify_record *
176 inotify_dequeue(struct inotify_softc *sc)
177 {
178 	struct inotify_record *rec;
179 
180 	mtx_assert(&sc->lock, MA_OWNED);
181 	KASSERT(!STAILQ_EMPTY(&sc->pending),
182 	    ("%s: queue for %p is empty", __func__, sc));
183 
184 	rec = STAILQ_FIRST(&sc->pending);
185 	STAILQ_REMOVE_HEAD(&sc->pending, link);
186 	sc->npending--;
187 	sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
188 	return (rec);
189 }
190 
191 static void
192 inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
193 {
194 	mtx_assert(&sc->lock, MA_OWNED);
195 
196 	if (head)
197 		STAILQ_INSERT_HEAD(&sc->pending, rec, link);
198 	else
199 		STAILQ_INSERT_TAIL(&sc->pending, rec, link);
200 	sc->npending++;
201 	sc->nbpending += sizeof(rec->ev) + rec->ev.len;
202 }
203 
204 static int
205 inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
206     struct thread *td)
207 {
208 	struct inotify_softc *sc;
209 	struct inotify_record *rec;
210 	int error;
211 	bool first;
212 
213 	sc = fp->f_data;
214 	error = 0;
215 
216 	mtx_lock(&sc->lock);
217 	while (STAILQ_EMPTY(&sc->pending)) {
218 		if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
219 			mtx_unlock(&sc->lock);
220 			return (EWOULDBLOCK);
221 		}
222 		error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
223 		if (error != 0) {
224 			mtx_unlock(&sc->lock);
225 			return (error);
226 		}
227 	}
228 	for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
229 		size_t len;
230 
231 		rec = inotify_dequeue(sc);
232 		len = sizeof(rec->ev) + rec->ev.len;
233 		if (uio->uio_resid < (ssize_t)len) {
234 			inotify_enqueue(sc, rec, true);
235 			if (first) {
236 				error = EXTERROR(EINVAL,
237 				    "read buffer is too small");
238 			}
239 			break;
240 		}
241 		mtx_unlock(&sc->lock);
242 		error = uiomove(&rec->ev, len, uio);
243 #ifdef KTRACE
244 		if (error == 0 && KTRPOINT(td, KTR_STRUCT))
245 			ktrstruct("inotify", &rec->ev, len);
246 #endif
247 		mtx_lock(&sc->lock);
248 		if (error != 0) {
249 			inotify_enqueue(sc, rec, true);
250 			mtx_unlock(&sc->lock);
251 			return (error);
252 		}
253 		if (rec == &sc->overflow) {
254 			/*
255 			 * Signal to inotify_queue_record() that the overflow
256 			 * record can be reused.
257 			 */
258 			memset(rec, 0, sizeof(*rec));
259 		} else {
260 			free(rec, M_INOTIFY);
261 		}
262 	}
263 	mtx_unlock(&sc->lock);
264 	return (error);
265 }
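
/*
 * A sketch of the userspace read loop served by inotify_read() above,
 * assuming the libc inotify wrappers described in inotify(2) and an already
 * created descriptor "ifd".  Records are variable length, so the consumer
 * steps by sizeof(struct inotify_event) + len; a buffer too small for the
 * next record yields EINVAL rather than a truncated record.
 *
 *	char buf[16384] __aligned(__alignof__(struct inotify_event));
 *	ssize_t n;
 *	char *p;
 *
 *	while ((n = read(ifd, buf, sizeof(buf))) > 0) {
 *		for (p = buf; p < buf + n;) {
 *			struct inotify_event *ev = (struct inotify_event *)p;
 *
 *			printf("wd %d mask %#x name '%s'\n", ev->wd, ev->mask,
 *			    ev->len != 0 ? ev->name : "");
 *			p += sizeof(*ev) + ev->len;
 *		}
 *	}
 */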
266 
267 static int
268 inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
269     struct thread *td)
270 {
271 	struct inotify_softc *sc;
272 
273 	sc = fp->f_data;
274 
275 	switch (com) {
276 	case FIONREAD:
277 		*(int *)data = (int)sc->nbpending;
278 		return (0);
279 	case FIONBIO:
280 	case FIOASYNC:
281 		return (0);
282 	default:
283 		return (ENOTTY);
284 	}
285 
286 	return (0);
287 }
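
/*
 * FIONREAD reports the total size in bytes of all pending records, which
 * lets a consumer size one read() to drain the queue.  A userspace sketch,
 * assuming only the ioctl handled above:
 *
 *	int nbytes;
 *	char *buf;
 *
 *	if (ioctl(ifd, FIONREAD, &nbytes) == 0 && nbytes > 0) {
 *		buf = malloc(nbytes);
 *		if (buf != NULL)
 *			(void)read(ifd, buf, nbytes);	// then parse as above
 *		free(buf);
 *	}
 */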
288 
289 static int
290 inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
291 {
292 	struct inotify_softc *sc;
293 	int revents;
294 
295 	sc = fp->f_data;
296 	revents = 0;
297 
298 	mtx_lock(&sc->lock);
299 	if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
300 		revents |= events & (POLLIN | POLLRDNORM);
301 	else
302 		selrecord(td, &sc->sel);
303 	mtx_unlock(&sc->lock);
304 	return (revents);
305 }
306 
307 static void
308 filt_inotifydetach(struct knote *kn)
309 {
310 	struct inotify_softc *sc;
311 
312 	sc = kn->kn_hook;
313 	knlist_remove(&sc->sel.si_note, kn, 0);
314 }
315 
316 static int
317 filt_inotifyevent(struct knote *kn, long hint)
318 {
319 	struct inotify_softc *sc;
320 
321 	sc = kn->kn_hook;
322 	mtx_assert(&sc->lock, MA_OWNED);
323 	kn->kn_data = sc->nbpending;
324 	return (kn->kn_data > 0);
325 }
326 
327 static int
328 inotify_kqfilter(struct file *fp, struct knote *kn)
329 {
330 	struct inotify_softc *sc;
331 
332 	if (kn->kn_filter != EVFILT_READ)
333 		return (EINVAL);
334 	sc = fp->f_data;
335 	kn->kn_fop = &inotify_rfiltops;
336 	kn->kn_hook = sc;
337 	knlist_add(&sc->sel.si_note, kn, 0);
338 	return (0);
339 }
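
/*
 * With the EVFILT_READ filter above, an inotify descriptor can be monitored
 * through kqueue(2); the returned data field is the number of bytes pending.
 * A userspace sketch, error handling omitted:
 *
 *	struct kevent kev;
 *	char buf[16384];
 *	int kq;
 *
 *	kq = kqueue();
 *	EV_SET(&kev, ifd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
 *	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1 && kev.data > 0)
 *		(void)read(ifd, buf, sizeof(buf));
 */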
340 
341 static int
342 inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
343 {
344 	struct inotify_softc *sc;
345 
346 	sc = fp->f_data;
347 
348 	memset(sb, 0, sizeof(*sb));
349 	sb->st_mode = S_IFREG | S_IRUSR;
350 	sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
351 	mtx_lock(&sc->lock);
352 	sb->st_size = sc->nbpending;
353 	sb->st_blocks = sc->npending;
354 	sb->st_uid = sc->cred->cr_ruid;
355 	sb->st_gid = sc->cred->cr_rgid;
356 	sb->st_ino = sc->ino;
357 	mtx_unlock(&sc->lock);
358 	return (0);
359 }
360 
361 static void
362 inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch)
363 {
364 	struct vnode *vp;
365 
366 	vp = watch->vp;
367 	mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);
368 
369 	atomic_subtract_int(&inotify_watches, 1);
370 	(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
371 
372 	TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
373 	if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
374 		vn_irflag_unset(vp, VIRF_INOTIFY);
375 }
376 
377 /*
378  * Assumes that the watch has already been removed from its softc.
379  */
380 static void
381 inotify_remove_watch(struct inotify_watch *watch)
382 {
383 	struct inotify_softc *sc;
384 	struct vnode *vp;
385 
386 	sc = watch->sc;
387 
388 	vp = watch->vp;
389 	mtx_lock(&vp->v_pollinfo->vpi_lock);
390 	inotify_unlink_watch_locked(sc, watch);
391 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
392 
393 	vrele(vp);
394 	free(watch, M_INOTIFY);
395 }
396 
397 static int
398 inotify_close(struct file *fp, struct thread *td)
399 {
400 	struct inotify_softc *sc;
401 	struct inotify_record *rec;
402 	struct inotify_watch *watch;
403 
404 	sc = fp->f_data;
405 
406 	mtx_lock(&sc->lock);
407 	(void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
408 	while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
409 		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
410 		mtx_unlock(&sc->lock);
411 		inotify_remove_watch(watch);
412 		mtx_lock(&sc->lock);
413 	}
414 	while (!STAILQ_EMPTY(&sc->pending)) {
415 		rec = inotify_dequeue(sc);
416 		if (rec != &sc->overflow)
417 			free(rec, M_INOTIFY);
418 	}
419 	mtx_unlock(&sc->lock);
420 	seldrain(&sc->sel);
421 	knlist_destroy(&sc->sel.si_note);
422 	mtx_destroy(&sc->lock);
423 	crfree(sc->cred);
424 	free(sc, M_INOTIFY);
425 	return (0);
426 }
427 
428 static int
429 inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
430     struct filedesc *fdp)
431 {
432 	struct inotify_softc *sc;
433 
434 	sc = fp->f_data;
435 
436 	kif->kf_type = KF_TYPE_INOTIFY;
437 	kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
438 	kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
439 	return (0);
440 }
441 
442 int
443 inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
444 {
445 	struct inotify_softc *sc;
446 	int fflags;
447 
448 	if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
449 		return (EINVAL);
450 
451 	if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
452 	    inotify_max_user_instances))
453 		return (EMFILE);
454 
455 	sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
456 	sc->nextwatch = 1; /* Required for compatibility. */
457 	STAILQ_INIT(&sc->pending);
458 	RB_INIT(&sc->watches);
459 	mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
460 	knlist_init_mtx(&sc->sel.si_note, &sc->lock);
461 	sc->cred = crhold(td->td_ucred);
462 	sc->ino = atomic_fetchadd_64(&inotify_ino, 1);
463 
464 	fflags = FREAD;
465 	if ((flags & IN_NONBLOCK) != 0)
466 		fflags |= FNONBLOCK;
467 	if ((flags & IN_CLOEXEC) != 0)
468 		*fflagsp |= O_CLOEXEC;
469 	finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);
470 
471 	return (0);
472 }
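
/*
 * A sketch of how such a descriptor is normally obtained from userspace,
 * assuming the libc inotify_init1() wrapper documented in inotify(2); the
 * flags correspond to the IN_NONBLOCK/IN_CLOEXEC handling above:
 *
 *	int ifd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
 *
 *	if (ifd == -1)
 *		err(1, "inotify_init1");
 */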
473 
474 static struct inotify_record *
475 inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
476     uint32_t cookie, int waitok)
477 {
478 	struct inotify_event *evp;
479 	struct inotify_record *rec;
480 
481 	rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
482 	    waitok | M_ZERO);
483 	if (rec == NULL)
484 		return (NULL);
485 	evp = &rec->ev;
486 	evp->wd = wd;
487 	evp->mask = event;
488 	evp->cookie = cookie;
489 	evp->len = _IN_NAMESIZE(namelen);
490 	if (name != NULL)
491 		memcpy(evp->name, name, namelen);
492 	return (rec);
493 }
494 
495 static bool
496 inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp)
497 {
498 	struct inotify_record *prev;
499 
500 	mtx_assert(&sc->lock, MA_OWNED);
501 
502 	prev = STAILQ_LAST(&sc->pending, inotify_record, link);
503 	return (prev != NULL && prev->ev.mask == evp->mask &&
504 	    prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie &&
505 	    prev->ev.len == evp->len &&
506 	    memcmp(prev->ev.name, evp->name, evp->len) == 0);
507 }
508 
509 static void
510 inotify_overflow_event(struct inotify_event *evp)
511 {
512 	evp->mask = IN_Q_OVERFLOW;
513 	evp->wd = -1;
514 	evp->cookie = 0;
515 	evp->len = 0;
516 }
517 
518 /*
519  * Put an event record on the queue for an inotify descriptor.  Return false if
520  * the record was not enqueued for some reason, true otherwise.
521  */
522 static bool
523 inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
524 {
525 	struct inotify_event *evp;
526 
527 	mtx_assert(&sc->lock, MA_OWNED);
528 
529 	evp = &rec->ev;
530 	if (__predict_false(rec == &sc->overflow)) {
531 		/*
532 		 * Is the overflow record already in the queue?  If so, there's
533 		 * not much else we can do: we're here because a kernel memory
534 		 * shortage prevented new record allocations.
535 		 */
536 		counter_u64_add(inotify_event_drops, 1);
537 		if (evp->mask == IN_Q_OVERFLOW)
538 			return (false);
539 		inotify_overflow_event(evp);
540 	} else {
541 		/* Try to coalesce duplicate events. */
542 		if (inotify_coalesce && inotify_can_coalesce(sc, evp))
543 			return (false);
544 
545 		/*
546 		 * Would this one overflow the queue?  If so, convert it to an
547 		 * overflow event and try again to coalesce.
548 		 */
549 		if (sc->npending >= inotify_max_queued_events) {
550 			counter_u64_add(inotify_event_drops, 1);
551 			inotify_overflow_event(evp);
552 			if (inotify_can_coalesce(sc, evp))
553 				return (false);
554 		}
555 	}
556 	inotify_enqueue(sc, rec, false);
557 	selwakeup(&sc->sel);
558 	KNOTE_LOCKED(&sc->sel.si_note, 0);
559 	wakeup(&sc->pending);
560 	return (true);
561 }
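
/*
 * When the queue limit is hit or a record allocation fails, readers see the
 * overflow record in place of the lost events.  A sketch of the check a
 * consumer would make, where "ev" is parsed as in the read loop sketch above
 * and resync_watches() is a hypothetical application callback:
 *
 *	if (ev->wd == -1 && (ev->mask & IN_Q_OVERFLOW) != 0) {
 *		// Events were dropped; rescan watched files to resynchronize.
 *		resync_watches();
 *	}
 */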
562 
563 static int
564 inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
565     int event, uint32_t cookie)
566 {
567 	struct inotify_watch key;
568 	struct inotify_softc *sc;
569 	struct inotify_record *rec;
570 	int relecount;
571 	bool allocfail;
572 
573 	relecount = 0;
574 
575 	sc = watch->sc;
576 	rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
577 	    M_NOWAIT);
578 	if (rec == NULL) {
579 		rec = &sc->overflow;
580 		allocfail = true;
581 	} else {
582 		allocfail = false;
583 	}
584 
585 	mtx_lock(&sc->lock);
586 	if (!inotify_queue_record(sc, rec) && rec != &sc->overflow)
587 		free(rec, M_INOTIFY);
588 	if ((watch->mask & IN_ONESHOT) != 0 ||
589 	    (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
590 		if (!allocfail) {
591 			rec = inotify_alloc_record(watch->wd, NULL, 0,
592 			    IN_IGNORED, 0, M_NOWAIT);
593 			if (rec == NULL)
594 				rec = &sc->overflow;
595 			if (!inotify_queue_record(sc, rec) &&
596 			    rec != &sc->overflow)
597 				free(rec, M_INOTIFY);
598 		}
599 
600 		/*
601 		 * Remove the watch, taking care to handle races with
602 		 * inotify_close().
603 		 */
604 		key.wd = watch->wd;
605 		if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
606 			RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
607 			inotify_unlink_watch_locked(sc, watch);
608 			free(watch, M_INOTIFY);
609 
610 			/* Defer vrele() until locks are dropped. */
611 			relecount++;
612 		}
613 	}
614 	mtx_unlock(&sc->lock);
615 	return (relecount);
616 }
617 
618 void
619 inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
620     uint32_t cookie)
621 {
622 	struct inotify_watch *watch, *tmp;
623 	int relecount;
624 
625 	KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
626 	    ("inotify_log: invalid event %#x", event));
627 
628 	relecount = 0;
629 	mtx_lock(&vp->v_pollinfo->vpi_lock);
630 	TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
631 		KASSERT(watch->vp == vp,
632 		    ("inotify_log: watch %p vp != vp", watch));
633 		if ((watch->mask & event) != 0 || event == IN_UNMOUNT) {
634 			relecount += inotify_log_one(watch, name, namelen, event,
635 			    cookie);
636 		}
637 	}
638 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
639 
640 	for (int i = 0; i < relecount; i++)
641 		vrele(vp);
642 }
643 
644 /*
645  * An inotify event occurred on a watched vnode.
646  */
647 void
648 vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
649     int event, uint32_t cookie)
650 {
651 	int isdir;
652 
653 	VNPASS(vp->v_holdcnt > 0, vp);
654 
655 	isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
656 
657 	if (dvp != NULL) {
658 		VNPASS(dvp->v_holdcnt > 0, dvp);
659 
660 		/*
661 		 * Should we log an event for the vnode itself?
662 		 */
663 		if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
664 			int selfevent;
665 
666 			switch (event) {
667 			case _IN_MOVE_DELETE:
668 			case IN_DELETE:
669 				/*
670 				 * IN_DELETE_SELF is only generated when the
671 				 * last hard link of a file is removed.
672 				 */
673 				selfevent = IN_DELETE_SELF;
674 				if (vp->v_type != VDIR) {
675 					struct vattr va;
676 					int error;
677 
678 					error = VOP_GETATTR(vp, &va,
679 					    cnp->cn_cred);
680 					if (error == 0 && va.va_nlink != 0)
681 						selfevent = 0;
682 				}
683 				break;
684 			case IN_MOVED_FROM:
685 				cookie = 0;
686 				selfevent = IN_MOVE_SELF;
687 				break;
688 			case _IN_ATTRIB_LINKCOUNT:
689 				selfevent = IN_ATTRIB;
690 				break;
691 			default:
692 				selfevent = event;
693 				break;
694 			}
695 
696 			if ((selfevent & ~_IN_DIR_EVENTS) != 0) {
697 				inotify_log(vp, NULL, 0, selfevent | isdir,
698 				    cookie);
699 			}
700 		}
701 
702 		/*
703 		 * Something is watching the directory through which this vnode
704 		 * was referenced, so we may need to log the event.
705 		 */
706 		if ((event & IN_ALL_EVENTS) != 0 &&
707 		    (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
708 			inotify_log(dvp, cnp->cn_nameptr,
709 			    cnp->cn_namelen, event | isdir, cookie);
710 		}
711 	} else {
712 		/*
713 		 * We don't know which watched directory might contain the
714 		 * vnode, so we have to fall back to searching the name cache.
715 		 */
716 		cache_vop_inotify(vp, event, cookie);
717 	}
718 }
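
/*
 * To illustrate the two logging paths above: with one watch on a directory
 * and another on a file inside it, renaming the file produces
 * IN_MOVED_FROM/IN_MOVED_TO records against the directory watch (carrying
 * the old and new names and a shared cookie) and an IN_MOVE_SELF record
 * against the file watch (no name, cookie zeroed, as in the switch above).
 * A userspace sketch, assuming the libc wrappers from inotify(2):
 *
 *	int dwd = inotify_add_watch(ifd, "/tmp/dir",
 *	    IN_MOVED_FROM | IN_MOVED_TO);
 *	int fwd = inotify_add_watch(ifd, "/tmp/dir/a", IN_MOVE_SELF);
 *
 *	rename("/tmp/dir/a", "/tmp/dir/b");
 *	// The queue now holds records for dwd naming "a" and "b" with a
 *	// common cookie, plus a record for fwd with mask IN_MOVE_SELF.
 */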
719 
720 int
721 vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
722     uint32_t *wdp, struct thread *td)
723 {
724 	struct inotify_watch *watch, *watch1;
725 	uint32_t wd;
726 
727 	/*
728 	 * If this is a directory, make sure all of its entries are present in
729 	 * the name cache so that we're able to look them up if an event occurs.
730 	 * The persistent reference on the directory prevents the outgoing name
731 	 * cache entries from being reclaimed.
732 	 */
733 	if (vp->v_type == VDIR) {
734 		struct dirent *dp;
735 		char *buf;
736 		off_t off;
737 		size_t buflen, len;
738 		int eof, error;
739 
740 		buflen = 128 * sizeof(struct dirent);
741 		buf = malloc(buflen, M_TEMP, M_WAITOK);
742 
743 		error = 0;
744 		len = off = eof = 0;
745 		for (;;) {
746 			struct nameidata nd;
747 
748 			error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
749 			    &len, &off, &eof);
750 			if (error != 0)
751 				break;
752 			if (len == 0)
753 				/* Finished reading. */
754 				break;
755 			if (strcmp(dp->d_name, ".") == 0 ||
756 			    strcmp(dp->d_name, "..") == 0)
757 				continue;
758 
759 			/*
760 			 * namei() consumes a reference on the starting
761 			 * directory if it's specified as a vnode.
762 			 */
763 			vrefact(vp);
764 			VOP_UNLOCK(vp);
765 			NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
766 			    dp->d_name, vp);
767 			error = namei(&nd);
768 			vn_lock(vp, LK_SHARED | LK_RETRY);
769 			if (error != 0)
770 				break;
771 			vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
772 			vrele(nd.ni_vp);
773 		}
774 		free(buf, M_TEMP);
775 		if (error != 0)
776 			return (error);
777 	}
778 
779 	/*
780 	 * The vnode referenced in kern_inotify_add_watch() might be different
781 	 * than this one if nullfs is in the picture.
782 	 */
783 	vrefact(vp);
784 	watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
785 	watch->sc = sc;
786 	watch->vp = vp;
787 	watch->mask = mask;
788 
789 	/*
790 	 * Are we updating an existing watch?  Search the vnode's list rather
791 	 * than that of the softc, as the former is likely to be shorter.
792 	 */
793 	v_addpollinfo(vp);
794 	mtx_lock(&vp->v_pollinfo->vpi_lock);
795 	TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
796 		if (watch1->sc == sc)
797 			break;
798 	}
799 	mtx_lock(&sc->lock);
800 	if (watch1 != NULL) {
801 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
802 
803 		/*
804 		 * We found an existing watch, update it based on our flags.
805 		 */
806 		if ((mask & IN_MASK_CREATE) != 0) {
807 			mtx_unlock(&sc->lock);
808 			vrele(vp);
809 			free(watch, M_INOTIFY);
810 			return (EEXIST);
811 		}
812 		if ((mask & IN_MASK_ADD) != 0)
813 			watch1->mask |= mask;
814 		else
815 			watch1->mask = mask;
816 		*wdp = watch1->wd;
817 		mtx_unlock(&sc->lock);
818 		vrele(vp);
819 		free(watch, M_INOTIFY);
820 		return (EJUSTRETURN);
821 	}
822 
823 	/*
824 	 * We're creating a new watch.  Add it to the softc and vnode watch
825 	 * lists.
826 	 */
827 	do {
828 		struct inotify_watch key;
829 
830 		/*
831 		 * Search for the next available watch descriptor.  This is
832 		 * implemented so as to avoid reusing watch descriptors for as
833 		 * long as possible.
834 		 */
835 		key.wd = wd = sc->nextwatch++;
836 		watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
837 	} while (watch1 != NULL || wd == 0);
838 	watch->wd = wd;
839 	RB_INSERT(inotify_watch_tree, &sc->watches, watch);
840 	TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
841 	mtx_unlock(&sc->lock);
842 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
843 	vn_irflag_set_cond(vp, VIRF_INOTIFY);
844 
845 	*wdp = wd;
846 
847 	return (0);
848 }
849 
850 void
851 vn_inotify_revoke(struct vnode *vp)
852 {
853 	if (vp->v_pollinfo == NULL) {
854 		/* This is a nullfs vnode which shadows a watched vnode. */
855 		return;
856 	}
857 	inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
858 }
859 
860 static int
861 fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
862     struct file **fpp)
863 {
864 	struct file *fp;
865 	int error;
866 
867 	error = fget(td, fd, needrightsp, &fp);
868 	if (error != 0)
869 		return (error);
870 	if (fp->f_type != DTYPE_INOTIFY) {
871 		fdrop(fp, td);
872 		return (EINVAL);
873 	}
874 	*fpp = fp;
875 	return (0);
876 }
877 
878 int
879 kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
880     struct thread *td)
881 {
882 	struct nameidata nd;
883 	struct file *fp;
884 	struct inotify_softc *sc;
885 	struct vnode *vp;
886 	uint32_t wd;
887 	int count, error;
888 
889 	fp = NULL;
890 	vp = NULL;
891 
892 	if ((mask & IN_ALL_EVENTS) == 0)
893 		return (EXTERROR(EINVAL, "no events specified"));
894 	if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
895 	    (IN_MASK_ADD | IN_MASK_CREATE))
896 		return (EXTERROR(EINVAL,
897 		    "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"));
898 	if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
899 		return (EXTERROR(EINVAL, "unrecognized flag"));
900 
901 	error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
902 	if (error != 0)
903 		return (error);
904 	sc = fp->f_data;
905 
906 	NDINIT_AT(&nd, LOOKUP,
907 	    ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
908 	    LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
909 	error = namei(&nd);
910 	if (error != 0)
911 		goto out;
912 	NDFREE_PNBUF(&nd);
913 	vp = nd.ni_vp;
914 
915 	error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
916 	if (error != 0)
917 		goto out;
918 
919 	if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
920 		error = ENOTDIR;
921 		goto out;
922 	}
923 
924 	count = atomic_fetchadd_int(&inotify_watches, 1);
925 	if (count > inotify_max_watches) {
926 		atomic_subtract_int(&inotify_watches, 1);
927 		error = ENOSPC;
928 		goto out;
929 	}
930 	if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
931 	    inotify_max_user_watches)) {
932 		atomic_subtract_int(&inotify_watches, 1);
933 		error = ENOSPC;
934 		goto out;
935 	}
936 	error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
937 	if (error != 0) {
938 		atomic_subtract_int(&inotify_watches, 1);
939 		(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
940 		if (error == EJUSTRETURN) {
941 			/* We updated an existing watch, everything is ok. */
942 			error = 0;
943 		} else {
944 			goto out;
945 		}
946 	}
947 	td->td_retval[0] = wd;
948 
949 out:
950 	if (vp != NULL)
951 		vput(vp);
952 	fdrop(fp, td);
953 	return (error);
954 }
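
/*
 * A sketch of the corresponding userspace call and of the flag handling
 * enforced above, assuming the libc inotify_add_watch() wrapper (the *_at
 * form below differs only in taking a directory descriptor):
 *
 *	// Watch a directory only, failing rather than updating an existing
 *	// watch (IN_MASK_CREATE).
 *	int wd = inotify_add_watch(ifd, "/tmp/dir",
 *	    IN_CREATE | IN_DELETE | IN_ONLYDIR | IN_MASK_CREATE);
 *
 *	if (wd == -1 && errno == EEXIST) {
 *		// A watch already exists; merge extra events into its mask.
 *		wd = inotify_add_watch(ifd, "/tmp/dir",
 *		    IN_CLOSE_WRITE | IN_MASK_ADD);
 *	}
 */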
955 
956 int
957 sys_inotify_add_watch_at(struct thread *td,
958     struct inotify_add_watch_at_args *uap)
959 {
960 	return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
961 	    uap->mask, td));
962 }
963 
964 int
965 kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
966 {
967 	struct file *fp;
968 	struct inotify_softc *sc;
969 	struct inotify_record *rec;
970 	struct inotify_watch key, *watch;
971 	int error;
972 
973 	error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
974 	if (error != 0)
975 		return (error);
976 	sc = fp->f_data;
977 
978 	rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);
979 
980 	/*
981 	 * For compatibility with Linux, we do not remove pending events
982 	 * associated with the watch.  Watch descriptors are implemented so as
983 	 * to avoid being reused for as long as possible, so one hopes that any
984 	 * pending events from the removed watch descriptor will be removed
985 	 * before the watch descriptor is recycled.
986 	 */
987 	key.wd = wd;
988 	mtx_lock(&sc->lock);
989 	watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
990 	if (watch == NULL) {
991 		free(rec, M_INOTIFY);
992 		error = EINVAL;
993 	} else {
994 		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
995 		if (!inotify_queue_record(sc, rec)) {
996 			free(rec, M_INOTIFY);
997 			error = 0;
998 		}
999 	}
1000 	mtx_unlock(&sc->lock);
1001 	if (watch != NULL)
1002 		inotify_remove_watch(watch);
1003 	fdrop(fp, td);
1004 	return (error);
1005 }
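
/*
 * A sketch of watch removal from userspace, assuming the libc
 * inotify_rm_watch() wrapper.  Per the comment above, events already queued
 * for the watch remain readable; the IN_IGNORED record queued here marks the
 * point at which the watch went away:
 *
 *	if (inotify_rm_watch(ifd, wd) == -1)
 *		warn("inotify_rm_watch");
 *	// Later read()s return any remaining events for wd, then a record
 *	// with mask IN_IGNORED and len 0 for that wd.
 */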
1006 
1007 int
1008 sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
1009 {
1010 	return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
1011 }
1012