xref: /freebsd/sys/kern/vfs_inotify.c (revision 12bef37a824c52582ee8f38699b8ae4fde17068d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2025 Klara, Inc.
5  */
6 
7 #include "opt_ktrace.h"
8 
9 #include <sys/param.h>
10 #include <sys/systm.h>
11 #include <sys/caprights.h>
12 #include <sys/counter.h>
13 #include <sys/dirent.h>
14 #define	EXTERR_CATEGORY	EXTERR_CAT_INOTIFY
15 #include <sys/exterrvar.h>
16 #include <sys/fcntl.h>
17 #include <sys/file.h>
18 #include <sys/filio.h>
19 #include <sys/inotify.h>
20 #include <sys/kernel.h>
21 #include <sys/lock.h>
22 #include <sys/ktrace.h>
23 #include <sys/malloc.h>
24 #include <sys/mutex.h>
25 #include <sys/namei.h>
26 #include <sys/poll.h>
27 #include <sys/proc.h>
28 #include <sys/queue.h>
29 #include <sys/resourcevar.h>
30 #include <sys/selinfo.h>
31 #include <sys/stat.h>
32 #include <sys/syscallsubr.h>
33 #include <sys/sysctl.h>
34 #include <sys/sysent.h>
35 #include <sys/syslimits.h>
36 #include <sys/sysproto.h>
37 #include <sys/tree.h>
38 #include <sys/user.h>
39 #include <sys/vnode.h>
40 
41 uint32_t inotify_rename_cookie;
42 
43 static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
44     "inotify configuration");
45 
46 static int inotify_max_queued_events = 16384;
47 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
48     &inotify_max_queued_events, 0,
49     "Maximum number of events to queue on an inotify descriptor");
50 
51 static int inotify_max_user_instances = 256;
52 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
53     &inotify_max_user_instances, 0,
54     "Maximum number of inotify descriptors per user");
55 
56 static int inotify_max_user_watches;
57 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
58     &inotify_max_user_watches, 0,
59     "Maximum number of inotify watches per user");
60 
61 static int inotify_max_watches;
62 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
63     &inotify_max_watches, 0,
64     "Maximum number of inotify watches system-wide");
65 
66 static int inotify_watches;
67 SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
68     &inotify_watches, 0,
69     "Total number of inotify watches currently in use");
70 
71 static int inotify_coalesce = 1;
72 SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
73     &inotify_coalesce, 0,
74     "Coalesce inotify events when possible");
75 
76 static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
77 SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
78     &inotify_event_drops,
79     "Number of inotify events dropped due to limits or allocation failures");
80 
81 static fo_rdwr_t	inotify_read;
82 static fo_ioctl_t	inotify_ioctl;
83 static fo_poll_t	inotify_poll;
84 static fo_kqfilter_t	inotify_kqfilter;
85 static fo_stat_t	inotify_stat;
86 static fo_close_t	inotify_close;
87 static fo_fill_kinfo_t	inotify_fill_kinfo;
88 
89 static const struct fileops inotifyfdops = {
90 	.fo_read = inotify_read,
91 	.fo_write = invfo_rdwr,
92 	.fo_truncate = invfo_truncate,
93 	.fo_ioctl = inotify_ioctl,
94 	.fo_poll = inotify_poll,
95 	.fo_kqfilter = inotify_kqfilter,
96 	.fo_stat = inotify_stat,
97 	.fo_close = inotify_close,
98 	.fo_chmod = invfo_chmod,
99 	.fo_chown = invfo_chown,
100 	.fo_sendfile = invfo_sendfile,
101 	.fo_fill_kinfo = inotify_fill_kinfo,
102 	.fo_cmp = file_kcmp_generic,
103 	.fo_flags = DFLAG_PASSABLE,
104 };
105 
106 static void	filt_inotifydetach(struct knote *kn);
107 static int	filt_inotifyevent(struct knote *kn, long hint);
108 
109 static const struct filterops inotify_rfiltops = {
110 	.f_isfd = 1,
111 	.f_detach = filt_inotifydetach,
112 	.f_event = filt_inotifyevent,
113 };
114 
115 static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");
116 
117 struct inotify_record {
118 	STAILQ_ENTRY(inotify_record) link;
119 	struct inotify_event	ev;
120 };
121 
122 static uint64_t inotify_ino = 1;
123 
124 /*
125  * On LP64 systems this occupies 64 bytes, so we don't get internal
126  * fragmentation by allocating watches with malloc(9).  If the size changes,
127  * consider using a UMA zone to improve memory efficiency.
128  */
129 struct inotify_watch {
130 	struct inotify_softc *sc; /* back-pointer */
131 	int		wd;	/* unique ID */
132 	uint32_t	mask;	/* event mask */
133 	struct vnode	*vp;	/* vnode being watched, refed */
134 	RB_ENTRY(inotify_watch) ilink;		/* inotify linkage */
135 	TAILQ_ENTRY(inotify_watch) vlink;	/* vnode linkage */
136 };
137 
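/*
 * Scale the default watch limits with the system-wide vnode limit, since each
 * watch holds a vnode reference.
 */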
138 static void
139 inotify_init(void *arg __unused)
140 {
141 	/* Don't let a user hold too many vnodes. */
142 	inotify_max_user_watches = desiredvnodes / 3;
143 	/* Don't let the system hold too many vnodes. */
144 	inotify_max_watches = desiredvnodes / 2;
145 }
146 SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);
147 
148 static int
149 inotify_watch_cmp(const struct inotify_watch *a,
150     const struct inotify_watch *b)
151 {
152 	if (a->wd < b->wd)
153 		return (-1);
154 	else if (a->wd > b->wd)
155 		return (1);
156 	else
157 		return (0);
158 }
159 RB_HEAD(inotify_watch_tree, inotify_watch);
160 RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);
161 
162 struct inotify_softc {
163 	struct mtx	lock;			/* serialize all softc writes */
164 	STAILQ_HEAD(, inotify_record) pending;	/* events waiting to be read */
165 	struct inotify_record overflow;		/* preallocated record */
166 	int		nextwatch;		/* next watch ID to try */
167 	int		npending;		/* number of pending events */
168 	size_t		nbpending;		/* bytes available to read */
169 	uint64_t	ino;			/* unique identifier */
170 	struct inotify_watch_tree watches;	/* active watches */
171 	struct selinfo	sel;			/* select/poll/kevent info */
172 	struct ucred	*cred;			/* credential ref */
173 };
174 
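/*
 * Remove and return the record at the head of the pending queue, updating the
 * queued event and byte counts.  The softc lock must be held.
 */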
175 static struct inotify_record *
176 inotify_dequeue(struct inotify_softc *sc)
177 {
178 	struct inotify_record *rec;
179 
180 	mtx_assert(&sc->lock, MA_OWNED);
181 	KASSERT(!STAILQ_EMPTY(&sc->pending),
182 	    ("%s: queue for %p is empty", __func__, sc));
183 
184 	rec = STAILQ_FIRST(&sc->pending);
185 	STAILQ_REMOVE_HEAD(&sc->pending, link);
186 	sc->npending--;
187 	sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
188 	return (rec);
189 }
190 
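/*
 * Add a record to the pending queue, at the head if "head" is true (used to
 * put back a record that could not be copied out), updating the counts.
 */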
191 static void
192 inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
193 {
194 	mtx_assert(&sc->lock, MA_OWNED);
195 
196 	if (head)
197 		STAILQ_INSERT_HEAD(&sc->pending, rec, link);
198 	else
199 		STAILQ_INSERT_TAIL(&sc->pending, rec, link);
200 	sc->npending++;
201 	sc->nbpending += sizeof(rec->ev) + rec->ev.len;
202 }
203 
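/*
 * Copy pending events into the caller's buffer, sleeping if the queue is
 * empty and the descriptor is not in non-blocking mode.  A buffer too small
 * to hold the next event results in EINVAL.
 */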
204 static int
205 inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
206     struct thread *td)
207 {
208 	struct inotify_softc *sc;
209 	struct inotify_record *rec;
210 	int error;
211 	bool first;
212 
213 	sc = fp->f_data;
214 	error = 0;
215 
216 	mtx_lock(&sc->lock);
217 	while (STAILQ_EMPTY(&sc->pending)) {
218 		if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
219 			mtx_unlock(&sc->lock);
220 			return (EWOULDBLOCK);
221 		}
222 		error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
223 		if (error != 0) {
224 			mtx_unlock(&sc->lock);
225 			return (error);
226 		}
227 	}
228 	for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
229 		size_t len;
230 
231 		rec = inotify_dequeue(sc);
232 		len = sizeof(rec->ev) + rec->ev.len;
233 		if (uio->uio_resid < (ssize_t)len) {
234 			inotify_enqueue(sc, rec, true);
235 			if (first) {
236 				error = EXTERROR(EINVAL,
237 				    "read buffer is too small");
238 			}
239 			break;
240 		}
241 		mtx_unlock(&sc->lock);
242 		error = uiomove(&rec->ev, len, uio);
243 #ifdef KTRACE
244 		if (error == 0 && KTRPOINT(td, KTR_STRUCT))
245 			ktrstruct("inotify", &rec->ev, len);
246 #endif
247 		mtx_lock(&sc->lock);
248 		if (error != 0) {
249 			inotify_enqueue(sc, rec, true);
250 			mtx_unlock(&sc->lock);
251 			return (error);
252 		}
253 		if (rec == &sc->overflow) {
254 			/*
255 			 * Signal to inotify_queue_record() that the overflow
256 			 * record can be reused.
257 			 */
258 			memset(rec, 0, sizeof(*rec));
259 		} else {
260 			free(rec, M_INOTIFY);
261 		}
262 	}
263 	mtx_unlock(&sc->lock);
264 	return (error);
265 }
266 
267 static int
268 inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
269     struct thread *td)
270 {
271 	struct inotify_softc *sc;
272 
273 	sc = fp->f_data;
274 
275 	switch (com) {
276 	case FIONREAD:
277 		*(int *)data = (int)sc->nbpending;
278 		return (0);
279 	case FIONBIO:
280 	case FIOASYNC:
281 		return (0);
282 	default:
283 		return (ENOTTY);
284 	}
285 
286 	return (0);
287 }
288 
289 static int
290 inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
291 {
292 	struct inotify_softc *sc;
293 	int revents;
294 
295 	sc = fp->f_data;
296 	revents = 0;
297 
298 	mtx_lock(&sc->lock);
299 	if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
300 		revents |= events & (POLLIN | POLLRDNORM);
301 	else
302 		selrecord(td, &sc->sel);
303 	mtx_unlock(&sc->lock);
304 	return (revents);
305 }
306 
307 static void
308 filt_inotifydetach(struct knote *kn)
309 {
310 	struct inotify_softc *sc;
311 
312 	sc = kn->kn_hook;
313 	knlist_remove(&sc->sel.si_note, kn, 0);
314 }
315 
316 static int
317 filt_inotifyevent(struct knote *kn, long hint)
318 {
319 	struct inotify_softc *sc;
320 
321 	sc = kn->kn_hook;
322 	mtx_assert(&sc->lock, MA_OWNED);
323 	kn->kn_data = sc->nbpending;
324 	return (kn->kn_data > 0);
325 }
326 
327 static int
328 inotify_kqfilter(struct file *fp, struct knote *kn)
329 {
330 	struct inotify_softc *sc;
331 
332 	if (kn->kn_filter != EVFILT_READ)
333 		return (EINVAL);
334 	sc = fp->f_data;
335 	kn->kn_fop = &inotify_rfiltops;
336 	kn->kn_hook = sc;
337 	knlist_add(&sc->sel.si_note, kn, 0);
338 	return (0);
339 }
340 
341 static int
342 inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
343 {
344 	struct inotify_softc *sc;
345 
346 	sc = fp->f_data;
347 
348 	memset(sb, 0, sizeof(*sb));
349 	sb->st_mode = S_IFREG | S_IRUSR;
350 	sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
351 	mtx_lock(&sc->lock);
352 	sb->st_size = sc->nbpending;
353 	sb->st_blocks = sc->npending;
354 	sb->st_uid = sc->cred->cr_ruid;
355 	sb->st_gid = sc->cred->cr_rgid;
356 	sb->st_ino = sc->ino;
357 	mtx_unlock(&sc->lock);
358 	return (0);
359 }
360 
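/*
 * Unlink a watch from its vnode's watch list and drop the per-user and global
 * watch accounting.  The vnode's pollinfo lock must be held.
 */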
361 static void
362 inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch)
363 {
364 	struct vnode *vp;
365 
366 	vp = watch->vp;
367 	mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);
368 
369 	atomic_subtract_int(&inotify_watches, 1);
370 	(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
371 
372 	TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
373 	if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
374 		vn_irflag_unset_locked(vp, VIRF_INOTIFY);
375 }
376 
377 /*
378  * Assumes that the watch has already been removed from its softc.
379  */
380 static void
381 inotify_remove_watch(struct inotify_watch *watch)
382 {
383 	struct inotify_softc *sc;
384 	struct vnode *vp;
385 
386 	sc = watch->sc;
387 
388 	vp = watch->vp;
389 	mtx_lock(&vp->v_pollinfo->vpi_lock);
390 	inotify_unlink_watch_locked(sc, watch);
391 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
392 
393 	vrele(vp);
394 	free(watch, M_INOTIFY);
395 }
396 
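/*
 * Release all watches and pending records when the inotify descriptor is
 * closed.
 */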
397 static int
398 inotify_close(struct file *fp, struct thread *td)
399 {
400 	struct inotify_softc *sc;
401 	struct inotify_record *rec;
402 	struct inotify_watch *watch;
403 
404 	sc = fp->f_data;
405 
406 	mtx_lock(&sc->lock);
407 	(void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
408 	while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
409 		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
410 		mtx_unlock(&sc->lock);
411 		inotify_remove_watch(watch);
412 		mtx_lock(&sc->lock);
413 	}
414 	while (!STAILQ_EMPTY(&sc->pending)) {
415 		rec = inotify_dequeue(sc);
416 		if (rec != &sc->overflow)
417 			free(rec, M_INOTIFY);
418 	}
419 	mtx_unlock(&sc->lock);
420 	seldrain(&sc->sel);
421 	knlist_destroy(&sc->sel.si_note);
422 	mtx_destroy(&sc->lock);
423 	crfree(sc->cred);
424 	free(sc, M_INOTIFY);
425 	return (0);
426 }
427 
428 static int
429 inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
430     struct filedesc *fdp)
431 {
432 	struct inotify_softc *sc;
433 
434 	sc = fp->f_data;
435 
436 	kif->kf_type = KF_TYPE_INOTIFY;
437 	kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
438 	kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
439 	return (0);
440 }
441 
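/*
 * Initialize the backing state for a new inotify descriptor, enforcing the
 * per-user instance limit.  Only IN_NONBLOCK and IN_CLOEXEC are accepted in
 * the flags argument.
 */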
442 int
443 inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
444 {
445 	struct inotify_softc *sc;
446 	int fflags;
447 
448 	if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
449 		return (EINVAL);
450 
451 	if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
452 	    inotify_max_user_instances))
453 		return (EMFILE);
454 
455 	sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
456 	sc->nextwatch = 1; /* Watch descriptors begin at 1, for Linux compatibility. */
457 	STAILQ_INIT(&sc->pending);
458 	RB_INIT(&sc->watches);
459 	mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
460 	knlist_init_mtx(&sc->sel.si_note, &sc->lock);
461 	sc->cred = crhold(td->td_ucred);
462 	sc->ino = atomic_fetchadd_64(&inotify_ino, 1);
463 
464 	fflags = FREAD;
465 	if ((flags & IN_NONBLOCK) != 0)
466 		fflags |= FNONBLOCK;
467 	if ((flags & IN_CLOEXEC) != 0)
468 		*fflagsp |= O_CLOEXEC;
469 	finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);
470 
471 	return (0);
472 }
473 
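/*
 * Allocate an event record large enough to hold the given name, or return
 * NULL if an M_NOWAIT allocation fails.
 */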
474 static struct inotify_record *
475 inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
476     uint32_t cookie, int waitok)
477 {
478 	struct inotify_event *evp;
479 	struct inotify_record *rec;
480 
481 	rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
482 	    waitok | M_ZERO);
483 	if (rec == NULL)
484 		return (NULL);
485 	evp = &rec->ev;
486 	evp->wd = wd;
487 	evp->mask = event;
488 	evp->cookie = cookie;
489 	evp->len = _IN_NAMESIZE(namelen);
490 	if (name != NULL)
491 		memcpy(evp->name, name, namelen);
492 	return (rec);
493 }
494 
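/*
 * Return true if the new event is identical to the most recently queued one
 * and so need not be queued separately.
 */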
495 static bool
496 inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp)
497 {
498 	struct inotify_record *prev;
499 
500 	mtx_assert(&sc->lock, MA_OWNED);
501 
502 	prev = STAILQ_LAST(&sc->pending, inotify_record, link);
503 	return (prev != NULL && prev->ev.mask == evp->mask &&
504 	    prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie &&
505 	    prev->ev.len == evp->len &&
506 	    (evp->len == 0 || strcmp(prev->ev.name, evp->name) == 0));
507 }
508 
509 static void
510 inotify_overflow_event(struct inotify_event *evp)
511 {
512 	evp->mask = IN_Q_OVERFLOW;
513 	evp->wd = -1;
514 	evp->cookie = 0;
515 	evp->len = 0;
516 }
517 
518 /*
519  * Put an event record on the queue for an inotify descriptor.  Return false
520  * if the record was coalesced or dropped rather than enqueued, true otherwise.
521  */
522 static bool
523 inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
524 {
525 	struct inotify_event *evp;
526 
527 	mtx_assert(&sc->lock, MA_OWNED);
528 
529 	evp = &rec->ev;
530 	if (__predict_false(rec == &sc->overflow)) {
531 		/*
532 		 * Is the overflow record already in the queue?  If so, there's
533 		 * not much else we can do: we're here because a kernel memory
534 		 * shortage prevented new record allocations.
535 		 */
536 		counter_u64_add(inotify_event_drops, 1);
537 		if (evp->mask == IN_Q_OVERFLOW)
538 			return (false);
539 		inotify_overflow_event(evp);
540 	} else {
541 		/* Try to coalesce duplicate events. */
542 		if (inotify_coalesce && inotify_can_coalesce(sc, evp))
543 			return (false);
544 
545 		/*
546 		 * Would this one overflow the queue?  If so, convert it to an
547 		 * overflow event and try again to coalesce.
548 		 */
549 		if (sc->npending >= inotify_max_queued_events) {
550 			counter_u64_add(inotify_event_drops, 1);
551 			inotify_overflow_event(evp);
552 			if (inotify_can_coalesce(sc, evp))
553 				return (false);
554 		}
555 	}
556 	inotify_enqueue(sc, rec, false);
557 	selwakeup(&sc->sel);
558 	KNOTE_LOCKED(&sc->sel.si_note, 0);
559 	wakeup(&sc->pending);
560 	return (true);
561 }
562 
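/*
 * Deliver an event to a single watch.  One-shot watches and watches whose
 * vnode is being deleted or unmounted also get an IN_IGNORED event and are
 * torn down; the return value is the number of deferred vrele() calls the
 * caller must perform once the vnode's pollinfo lock is dropped.
 */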
563 static int
564 inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
565     int event, uint32_t cookie)
566 {
567 	struct inotify_watch key;
568 	struct inotify_softc *sc;
569 	struct inotify_record *rec;
570 	int relecount;
571 	bool allocfail;
572 
573 	relecount = 0;
574 
575 	sc = watch->sc;
576 	rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
577 	    M_NOWAIT);
578 	if (rec == NULL) {
579 		rec = &sc->overflow;
580 		allocfail = true;
581 	} else {
582 		allocfail = false;
583 	}
584 
585 	mtx_lock(&sc->lock);
586 	if (!inotify_queue_record(sc, rec) && rec != &sc->overflow)
587 		free(rec, M_INOTIFY);
588 	if ((watch->mask & IN_ONESHOT) != 0 ||
589 	    (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
590 		if (!allocfail) {
591 			rec = inotify_alloc_record(watch->wd, NULL, 0,
592 			    IN_IGNORED, 0, M_NOWAIT);
593 			if (rec == NULL)
594 				rec = &sc->overflow;
595 			if (!inotify_queue_record(sc, rec) &&
596 			    rec != &sc->overflow)
597 				free(rec, M_INOTIFY);
598 		}
599 
600 		/*
601 		 * Remove the watch, taking care to handle races with
602 		 * inotify_close().
603 		 */
604 		key.wd = watch->wd;
605 		if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
606 			RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
607 			inotify_unlink_watch_locked(sc, watch);
608 			free(watch, M_INOTIFY);
609 
610 			/* Defer vrele() until locks are dropped. */
611 			relecount++;
612 		}
613 	}
614 	mtx_unlock(&sc->lock);
615 	return (relecount);
616 }
617 
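/*
 * Deliver an event to every watch on the vnode whose mask matches it
 * (IN_UNMOUNT is delivered unconditionally), then release the vnode
 * references deferred by inotify_log_one().
 */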
618 void
619 inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
620     uint32_t cookie)
621 {
622 	struct inotify_watch *watch, *tmp;
623 	int relecount;
624 
625 	KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
626 	    ("inotify_log: invalid event %#x", event));
627 
628 	relecount = 0;
629 	mtx_lock(&vp->v_pollinfo->vpi_lock);
630 	TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
631 		KASSERT(watch->vp == vp,
632 		    ("inotify_log: watch %p vp != vp", watch));
633 		if ((watch->mask & event) != 0 || event == IN_UNMOUNT) {
634 			relecount += inotify_log_one(watch, name, namelen, event,
635 			    cookie);
636 		}
637 	}
638 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
639 
640 	for (int i = 0; i < relecount; i++)
641 		vrele(vp);
642 }
643 
644 /*
645  * An inotify event occurred on a watched vnode.
646  */
647 void
648 vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
649     int event, uint32_t cookie)
650 {
651 	int isdir;
652 
653 	VNPASS(vp->v_holdcnt > 0, vp);
654 
655 	isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
656 
657 	if (dvp != NULL) {
658 		VNPASS(dvp->v_holdcnt > 0, dvp);
659 
660 		/*
661 		 * Should we log an event for the vnode itself?
662 		 */
663 		if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
664 			int selfevent;
665 
666 			switch (event) {
667 			case _IN_MOVE_DELETE:
668 			case IN_DELETE:
669 				/*
670 				 * IN_DELETE_SELF is only generated when the
671 				 * last hard link of a file is removed.
672 				 */
673 				selfevent = IN_DELETE_SELF;
674 				if (vp->v_type != VDIR) {
675 					struct vattr va;
676 					int error;
677 
678 					error = VOP_GETATTR(vp, &va, cnp->cn_cred);
679 					if (error == 0 && va.va_nlink != 0)
680 						selfevent = 0;
681 				}
682 				break;
683 			case IN_MOVED_FROM:
684 				cookie = 0;
685 				selfevent = IN_MOVE_SELF;
686 				break;
687 			case _IN_ATTRIB_LINKCOUNT:
688 				selfevent = IN_ATTRIB;
689 				break;
690 			default:
691 				selfevent = event;
692 				break;
693 			}
694 
695 			if ((selfevent & ~_IN_DIR_EVENTS) != 0) {
696 				inotify_log(vp, NULL, 0, selfevent | isdir,
697 				    cookie);
698 			}
699 		}
700 
701 		/*
702 		 * Something is watching the directory through which this vnode
703 		 * was referenced, so we may need to log the event.
704 		 */
705 		if ((event & IN_ALL_EVENTS) != 0 &&
706 		    (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
707 			inotify_log(dvp, cnp->cn_nameptr,
708 			    cnp->cn_namelen, event | isdir, cookie);
709 		}
710 	} else {
711 		/*
712 		 * We don't know which watched directory might contain the
713 		 * vnode, so we have to fall back to searching the name cache.
714 		 */
715 		cache_vop_inotify(vp, event, cookie);
716 	}
717 }
718 
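/*
 * Add a watch on a vnode for the given inotify descriptor, or update an
 * existing watch.  Returns EJUSTRETURN when an existing watch was updated so
 * that the caller can roll back its watch accounting.
 */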
719 int
720 vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
721     uint32_t *wdp, struct thread *td)
722 {
723 	struct inotify_watch *watch, *watch1;
724 	uint32_t wd;
725 
726 	/*
727 	 * If this is a directory, make sure all of its entries are present in
728 	 * the name cache so that we're able to look them up if an event occurs.
729 	 * The persistent reference on the directory prevents the outgoing name
730 	 * cache entries from being reclaimed.
731 	 */
732 	if (vp->v_type == VDIR) {
733 		struct dirent *dp;
734 		char *buf;
735 		off_t off;
736 		size_t buflen, len;
737 		int eof, error;
738 
739 		buflen = 128 * sizeof(struct dirent);
740 		buf = malloc(buflen, M_TEMP, M_WAITOK);
741 
742 		error = 0;
743 		len = off = eof = 0;
744 		for (;;) {
745 			struct nameidata nd;
746 
747 			error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
748 			    &len, &off, &eof);
749 			if (error != 0)
750 				break;
751 			if (len == 0)
752 				/* Finished reading. */
753 				break;
754 			if (strcmp(dp->d_name, ".") == 0 ||
755 			    strcmp(dp->d_name, "..") == 0)
756 				continue;
757 
758 			/*
759 			 * namei() consumes a reference on the starting
760 			 * directory if it's specified as a vnode.
761 			 */
762 			vrefact(vp);
763 			NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
764 			    dp->d_name, vp);
765 			error = namei(&nd);
766 			if (error != 0)
767 				break;
768 			vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
769 			vrele(nd.ni_vp);
770 		}
771 		free(buf, M_TEMP);
772 		if (error != 0)
773 			return (error);
774 	}
775 
776 	/*
777  * The vnode referenced in kern_inotify_add_watch() might be different
778  * from this one if nullfs is in the picture.
779 	 */
780 	vrefact(vp);
781 	watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
782 	watch->sc = sc;
783 	watch->vp = vp;
784 	watch->mask = mask;
785 
786 	/*
787 	 * Are we updating an existing watch?  Search the vnode's list rather
788 	 * than that of the softc, as the former is likely to be shorter.
789 	 */
790 	v_addpollinfo(vp);
791 	mtx_lock(&vp->v_pollinfo->vpi_lock);
792 	TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
793 		if (watch1->sc == sc)
794 			break;
795 	}
796 	mtx_lock(&sc->lock);
797 	if (watch1 != NULL) {
798 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
799 
800 		 * We found an existing watch; update its mask based on our flags.
801 		 * We found an existing watch, update it based on our flags.
802 		 */
803 		if ((mask & IN_MASK_CREATE) != 0) {
804 			mtx_unlock(&sc->lock);
805 			vrele(vp);
806 			free(watch, M_INOTIFY);
807 			return (EEXIST);
808 		}
809 		if ((mask & IN_MASK_ADD) != 0)
810 			watch1->mask |= mask;
811 		else
812 			watch1->mask = mask;
813 		*wdp = watch1->wd;
814 		mtx_unlock(&sc->lock);
815 		vrele(vp);
816 		free(watch, M_INOTIFY);
817 		return (EJUSTRETURN);
818 	}
819 
820 	/*
821 	 * We're creating a new watch.  Add it to the softc and vnode watch
822 	 * lists.
823 	 */
824 	do {
825 		struct inotify_watch key;
826 
827 		/*
828 		 * Search for the next available watch descriptor.  This is
829 		 * implemented so as to avoid reusing watch descriptors for as
830 		 * long as possible.
831 		 */
832 		key.wd = wd = sc->nextwatch++;
833 		watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
834 	} while (watch1 != NULL || wd == 0);
835 	watch->wd = wd;
836 	RB_INSERT(inotify_watch_tree, &sc->watches, watch);
837 	TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
838 	mtx_unlock(&sc->lock);
839 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
840 	vn_irflag_set_cond(vp, VIRF_INOTIFY);
841 
842 	*wdp = wd;
843 
844 	return (0);
845 }
846 
847 void
848 vn_inotify_revoke(struct vnode *vp)
849 {
850 	if (vp->v_pollinfo == NULL) {
851 		/* This is a nullfs vnode which shadows a watched vnode. */
852 		return;
853 	}
854 	inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
855 }
856 
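/*
 * Look up a file descriptor and check that it refers to an inotify
 * descriptor.
 */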
857 static int
858 fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
859     struct file **fpp)
860 {
861 	struct file *fp;
862 	int error;
863 
864 	error = fget(td, fd, needrightsp, &fp);
865 	if (error != 0)
866 		return (error);
867 	if (fp->f_type != DTYPE_INOTIFY) {
868 		fdrop(fp, td);
869 		return (EINVAL);
870 	}
871 	*fpp = fp;
872 	return (0);
873 }
874 
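/*
 * Common code for the inotify_add_watch_at() system call: validate the event
 * mask, look up the path relative to dfd, enforce the watch limits, and
 * install the watch via VOP_INOTIFY_ADD_WATCH().
 */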
875 int
876 kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
877     struct thread *td)
878 {
879 	struct nameidata nd;
880 	struct file *fp;
881 	struct inotify_softc *sc;
882 	struct vnode *vp;
883 	uint32_t wd;
884 	int count, error;
885 
886 	fp = NULL;
887 	vp = NULL;
888 
889 	if ((mask & IN_ALL_EVENTS) == 0)
890 		return (EXTERROR(EINVAL, "no events specified"));
891 	if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
892 	    (IN_MASK_ADD | IN_MASK_CREATE))
893 		return (EXTERROR(EINVAL,
894 		    "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"));
895 	if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
896 		return (EXTERROR(EINVAL, "unrecognized flag"));
897 
898 	error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
899 	if (error != 0)
900 		return (error);
901 	sc = fp->f_data;
902 
903 	NDINIT_AT(&nd, LOOKUP,
904 	    ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
905 	    LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
906 	error = namei(&nd);
907 	if (error != 0)
908 		goto out;
909 	NDFREE_PNBUF(&nd);
910 	vp = nd.ni_vp;
911 
912 	error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
913 	if (error != 0)
914 		goto out;
915 
916 	if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
917 		error = ENOTDIR;
918 		goto out;
919 	}
920 
921 	count = atomic_fetchadd_int(&inotify_watches, 1);
922 	if (count > inotify_max_watches) {
923 		atomic_subtract_int(&inotify_watches, 1);
924 		error = ENOSPC;
925 		goto out;
926 	}
927 	if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
928 	    inotify_max_user_watches)) {
929 		atomic_subtract_int(&inotify_watches, 1);
930 		error = ENOSPC;
931 		goto out;
932 	}
933 	error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
934 	if (error != 0) {
935 		atomic_subtract_int(&inotify_watches, 1);
936 		(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
937 		if (error == EJUSTRETURN) {
938 			/* We updated an existing watch; everything is OK. */
939 			error = 0;
940 		} else {
941 			goto out;
942 		}
943 	}
944 	td->td_retval[0] = wd;
945 
946 out:
947 	if (vp != NULL)
948 		vput(vp);
949 	fdrop(fp, td);
950 	return (error);
951 }
952 
953 int
954 sys_inotify_add_watch_at(struct thread *td,
955     struct inotify_add_watch_at_args *uap)
956 {
957 	return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
958 	    uap->mask, td));
959 }
960 
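/*
 * Remove a watch identified by its watch descriptor, queueing an IN_IGNORED
 * event for it.
 */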
961 int
962 kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
963 {
964 	struct file *fp;
965 	struct inotify_softc *sc;
966 	struct inotify_record *rec;
967 	struct inotify_watch key, *watch;
968 	int error;
969 
970 	error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
971 	if (error != 0)
972 		return (error);
973 	sc = fp->f_data;
974 
975 	rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);
976 
977 	/*
978 	 * For compatibility with Linux, we do not remove pending events
979 	 * associated with the watch.  Watch descriptors are implemented so as
980 	 * to avoid being reused for as long as possible, so one hopes that any
981 	 * pending events from the removed watch descriptor will have been
982 	 * consumed before that descriptor is recycled.
983 	 */
984 	key.wd = wd;
985 	mtx_lock(&sc->lock);
986 	watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
987 	if (watch == NULL) {
988 		free(rec, M_INOTIFY);
989 		error = EINVAL;
990 	} else {
991 		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
992 		if (!inotify_queue_record(sc, rec)) {
993 			free(rec, M_INOTIFY);
994 			error = 0;
995 		}
996 	}
997 	mtx_unlock(&sc->lock);
998 	if (watch != NULL)
999 		inotify_remove_watch(watch);
1000 	fdrop(fp, td);
1001 	return (error);
1002 }
1003 
1004 int
1005 sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
1006 {
1007 	return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
1008 }
1009