xref: /freebsd/sys/kern/vfs_inotify.c (revision da8ab13249420e85935b89794f333f0755e56385)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2025 Klara, Inc.
5  */
6 
7 #include "opt_ktrace.h"
8 
9 #include <sys/param.h>
10 #include <sys/systm.h>
11 #include <sys/caprights.h>
12 #include <sys/counter.h>
13 #include <sys/dirent.h>
14 #define	EXTERR_CATEGORY	EXTERR_CAT_INOTIFY
15 #include <sys/exterrvar.h>
16 #include <sys/fcntl.h>
17 #include <sys/file.h>
18 #include <sys/filio.h>
19 #include <sys/inotify.h>
20 #include <sys/kernel.h>
21 #include <sys/lock.h>
22 #include <sys/ktrace.h>
23 #include <sys/malloc.h>
24 #include <sys/mutex.h>
25 #include <sys/namei.h>
26 #include <sys/poll.h>
27 #include <sys/proc.h>
28 #include <sys/queue.h>
29 #include <sys/resourcevar.h>
30 #include <sys/selinfo.h>
31 #include <sys/stat.h>
32 #include <sys/syscallsubr.h>
33 #include <sys/sysctl.h>
34 #include <sys/sysent.h>
35 #include <sys/syslimits.h>
36 #include <sys/sysproto.h>
37 #include <sys/taskqueue.h>
38 #include <sys/tree.h>
39 #include <sys/user.h>
40 #include <sys/vnode.h>
41 
42 uint32_t inotify_rename_cookie;
43 
44 static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
45     "inotify configuration");
46 
47 static int inotify_max_queued_events = 16384;
48 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
49     &inotify_max_queued_events, 0,
50     "Maximum number of events to queue on an inotify descriptor");
51 
52 static int inotify_max_user_instances = 256;
53 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
54     &inotify_max_user_instances, 0,
55     "Maximum number of inotify descriptors per user");
56 
57 static int inotify_max_user_watches;
58 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
59     &inotify_max_user_watches, 0,
60     "Maximum number of inotify watches per user");
61 
62 static int inotify_max_watches;
63 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
64     &inotify_max_watches, 0,
65     "Maximum number of inotify watches system-wide");
66 
67 static int inotify_watches;
68 SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
69     &inotify_watches, 0,
70     "Total number of inotify watches currently in use");
71 
72 static int inotify_coalesce = 1;
73 SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
74     &inotify_coalesce, 0,
75     "Coalesce inotify events when possible");
76 
77 static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
78 SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
79     &inotify_event_drops,
80     "Number of inotify events dropped due to limits or allocation failures");
81 
82 static fo_rdwr_t	inotify_read;
83 static fo_ioctl_t	inotify_ioctl;
84 static fo_poll_t	inotify_poll;
85 static fo_kqfilter_t	inotify_kqfilter;
86 static fo_stat_t	inotify_stat;
87 static fo_close_t	inotify_close;
88 static fo_fill_kinfo_t	inotify_fill_kinfo;
89 
90 static const struct fileops inotifyfdops = {
91 	.fo_read = inotify_read,
92 	.fo_write = invfo_rdwr,
93 	.fo_truncate = invfo_truncate,
94 	.fo_ioctl = inotify_ioctl,
95 	.fo_poll = inotify_poll,
96 	.fo_kqfilter = inotify_kqfilter,
97 	.fo_stat = inotify_stat,
98 	.fo_close = inotify_close,
99 	.fo_chmod = invfo_chmod,
100 	.fo_chown = invfo_chown,
101 	.fo_sendfile = invfo_sendfile,
102 	.fo_fill_kinfo = inotify_fill_kinfo,
103 	.fo_cmp = file_kcmp_generic,
104 	.fo_flags = DFLAG_PASSABLE,
105 };
106 
107 static void	filt_inotifydetach(struct knote *kn);
108 static int	filt_inotifyevent(struct knote *kn, long hint);
109 
110 static const struct filterops inotify_rfiltops = {
111 	.f_isfd = 1,
112 	.f_detach = filt_inotifydetach,
113 	.f_event = filt_inotifyevent,
114 	.f_copy = knote_triv_copy,
115 };
116 
117 static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");
118 
119 struct inotify_record {
120 	STAILQ_ENTRY(inotify_record) link;
121 	struct inotify_event	ev;
122 };
123 
124 static uint64_t inotify_ino = 1;
125 
126 /*
127  * On LP64 systems this occupies 64 bytes, so we don't get internal
128  * fragmentation by allocating watches with malloc(9).  If the size changes,
129  * consider using a UMA zone to improve memory efficiency.
130  */
131 struct inotify_watch {
132 	struct inotify_softc *sc; /* back-pointer */
133 	int		wd;	/* unique ID */
134 	uint32_t	mask;	/* event mask */
135 	struct vnode	*vp;	/* vnode being watched, refed */
136 	RB_ENTRY(inotify_watch) ilink;		/* inotify linkage */
137 	TAILQ_ENTRY(inotify_watch) vlink;	/* vnode linkage */
138 };
139 
140 static void
141 inotify_init(void *arg __unused)
142 {
143 	/* Don't let a user hold too many vnodes. */
144 	inotify_max_user_watches = desiredvnodes / 3;
145 	/* Don't let the system hold too many vnodes. */
146 	inotify_max_watches = desiredvnodes / 2;
147 }
148 SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);
149 
150 static int
151 inotify_watch_cmp(const struct inotify_watch *a,
152     const struct inotify_watch *b)
153 {
154 	if (a->wd < b->wd)
155 		return (-1);
156 	else if (a->wd > b->wd)
157 		return (1);
158 	else
159 		return (0);
160 }
161 RB_HEAD(inotify_watch_tree, inotify_watch);
162 RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);
163 
164 struct inotify_softc {
165 	struct mtx	lock;			/* serialize all softc writes */
166 	STAILQ_HEAD(, inotify_record) pending;	/* events waiting to be read */
167 	struct inotify_record overflow;		/* preallocated record */
168 	int		nextwatch;		/* next watch ID to try */
169 	int		npending;		/* number of pending events */
170 	size_t		nbpending;		/* bytes available to read */
171 	uint64_t	ino;			/* unique identifier */
172 	struct inotify_watch_tree watches;	/* active watches */
173 	TAILQ_HEAD(, inotify_watch) deadwatches; /* watches pending vrele() */
174 	struct task	reaptask;		/* task to reap dead watches */
175 	struct selinfo	sel;			/* select/poll/kevent info */
176 	struct ucred	*cred;			/* credential ref */
177 };
178 
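/*
 * Remove and return the record at the head of the pending queue, updating
 * the pending event and byte counts.  The softc lock must be held.
 */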
179 static struct inotify_record *
180 inotify_dequeue(struct inotify_softc *sc)
181 {
182 	struct inotify_record *rec;
183 
184 	mtx_assert(&sc->lock, MA_OWNED);
185 	KASSERT(!STAILQ_EMPTY(&sc->pending),
186 	    ("%s: queue for %p is empty", __func__, sc));
187 
188 	rec = STAILQ_FIRST(&sc->pending);
189 	STAILQ_REMOVE_HEAD(&sc->pending, link);
190 	sc->npending--;
191 	sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
192 	return (rec);
193 }
194 
195 static void
196 inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
197 {
198 	mtx_assert(&sc->lock, MA_OWNED);
199 
200 	if (head)
201 		STAILQ_INSERT_HEAD(&sc->pending, rec, link);
202 	else
203 		STAILQ_INSERT_TAIL(&sc->pending, rec, link);
204 	sc->npending++;
205 	sc->nbpending += sizeof(rec->ev) + rec->ev.len;
206 }
207 
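/*
 * Copy pending event records out to userspace.  A read blocks until at least
 * one record is queued unless the descriptor is non-blocking, and then
 * transfers as many complete records as fit in the caller's buffer; a buffer
 * too small to hold the next record yields EINVAL.
 */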
208 static int
209 inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
210     struct thread *td)
211 {
212 	struct inotify_softc *sc;
213 	struct inotify_record *rec;
214 	int error;
215 	bool first;
216 
217 	sc = fp->f_data;
218 	error = 0;
219 
220 	mtx_lock(&sc->lock);
221 	while (STAILQ_EMPTY(&sc->pending)) {
222 		if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
223 			mtx_unlock(&sc->lock);
224 			return (EWOULDBLOCK);
225 		}
226 		error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
227 		if (error != 0) {
228 			mtx_unlock(&sc->lock);
229 			return (error);
230 		}
231 	}
232 	for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
233 		size_t len;
234 
235 		rec = inotify_dequeue(sc);
236 		len = sizeof(rec->ev) + rec->ev.len;
237 		if (uio->uio_resid < (ssize_t)len) {
238 			inotify_enqueue(sc, rec, true);
239 			if (first) {
240 				error = EXTERROR(EINVAL,
241 				    "read buffer is too small");
242 			}
243 			break;
244 		}
245 		mtx_unlock(&sc->lock);
246 		error = uiomove(&rec->ev, len, uio);
247 #ifdef KTRACE
248 		if (error == 0 && KTRPOINT(td, KTR_STRUCT))
249 			ktrstruct("inotify", &rec->ev, len);
250 #endif
251 		mtx_lock(&sc->lock);
252 		if (error != 0) {
253 			inotify_enqueue(sc, rec, true);
254 			mtx_unlock(&sc->lock);
255 			return (error);
256 		}
257 		if (rec == &sc->overflow) {
258 			/*
259 			 * Signal to inotify_queue_record() that the overflow
260 			 * record can be reused.
261 			 */
262 			memset(rec, 0, sizeof(*rec));
263 		} else {
264 			free(rec, M_INOTIFY);
265 		}
266 	}
267 	mtx_unlock(&sc->lock);
268 	return (error);
269 }
270 
271 static int
272 inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
273     struct thread *td)
274 {
275 	struct inotify_softc *sc;
276 
277 	sc = fp->f_data;
278 
279 	switch (com) {
280 	case FIONREAD:
281 		*(int *)data = (int)sc->nbpending;
282 		return (0);
283 	case FIONBIO:
284 	case FIOASYNC:
285 		return (0);
286 	default:
287 		return (ENOTTY);
288 	}
289 
290 	return (0);
291 }
292 
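/*
 * Report the descriptor as readable if any events are pending; otherwise
 * record the polling thread so that inotify_queue_record() can wake it up.
 */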
293 static int
294 inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
295 {
296 	struct inotify_softc *sc;
297 	int revents;
298 
299 	sc = fp->f_data;
300 	revents = 0;
301 
302 	mtx_lock(&sc->lock);
303 	if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
304 		revents |= events & (POLLIN | POLLRDNORM);
305 	else
306 		selrecord(td, &sc->sel);
307 	mtx_unlock(&sc->lock);
308 	return (revents);
309 }
310 
311 static void
312 filt_inotifydetach(struct knote *kn)
313 {
314 	struct inotify_softc *sc;
315 
316 	sc = kn->kn_hook;
317 	knlist_remove(&sc->sel.si_note, kn, 0);
318 }
319 
320 static int
321 filt_inotifyevent(struct knote *kn, long hint)
322 {
323 	struct inotify_softc *sc;
324 
325 	sc = kn->kn_hook;
326 	mtx_assert(&sc->lock, MA_OWNED);
327 	kn->kn_data = sc->nbpending;
328 	return (kn->kn_data > 0);
329 }
330 
331 static int
332 inotify_kqfilter(struct file *fp, struct knote *kn)
333 {
334 	struct inotify_softc *sc;
335 
336 	if (kn->kn_filter != EVFILT_READ)
337 		return (EINVAL);
338 	sc = fp->f_data;
339 	kn->kn_fop = &inotify_rfiltops;
340 	kn->kn_hook = sc;
341 	knlist_add(&sc->sel.si_note, kn, 0);
342 	return (0);
343 }
344 
345 static int
346 inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
347 {
348 	struct inotify_softc *sc;
349 
350 	sc = fp->f_data;
351 
352 	memset(sb, 0, sizeof(*sb));
353 	sb->st_mode = S_IFREG | S_IRUSR;
354 	sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
355 	mtx_lock(&sc->lock);
356 	sb->st_size = sc->nbpending;
357 	sb->st_blocks = sc->npending;
358 	sb->st_uid = sc->cred->cr_ruid;
359 	sb->st_gid = sc->cred->cr_rgid;
360 	sb->st_ino = sc->ino;
361 	mtx_unlock(&sc->lock);
362 	return (0);
363 }
364 
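/*
 * Detach a watch from its vnode's watch list and drop the per-user and
 * global watch counts.  The vnode's pollinfo lock must be held.
 */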
365 static void
366 inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch)
367 {
368 	struct vnode *vp;
369 
370 	vp = watch->vp;
371 	mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);
372 
373 	atomic_subtract_int(&inotify_watches, 1);
374 	(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
375 
376 	TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
377 	if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
378 		vn_irflag_unset(vp, VIRF_INOTIFY);
379 }
380 
381 static void
382 inotify_free_watch(struct inotify_watch *watch)
383 {
384 	/*
385 	 * Formally, we don't need to lock the vnode here.  However, if we
386 	 * don't, and vrele() releases the last reference, it's possible the
387 	 * vnode will be recycled while a different thread holds the vnode lock.
388 	 * Work around this bug by acquiring the lock here.
389 	 */
390 	(void)vn_lock(watch->vp, LK_EXCLUSIVE | LK_RETRY);
391 	vput(watch->vp);
392 	free(watch, M_INOTIFY);
393 }
394 
395 /*
396  * Assumes that the watch has already been removed from its softc.
397  */
398 static void
399 inotify_remove_watch(struct inotify_watch *watch)
400 {
401 	struct inotify_softc *sc;
402 	struct vnode *vp;
403 
404 	sc = watch->sc;
405 
406 	vp = watch->vp;
407 	mtx_lock(&vp->v_pollinfo->vpi_lock);
408 	inotify_unlink_watch_locked(sc, watch);
409 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
410 	inotify_free_watch(watch);
411 }
412 
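/*
 * Taskqueue callback to release vnode references for watches that were
 * removed in a context where sleeping was not permitted.
 */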
413 static void
414 inotify_reap(void *arg, int pending)
415 {
416 	struct inotify_softc *sc;
417 	struct inotify_watch *watch;
418 
419 	sc = arg;
420 	mtx_lock(&sc->lock);
421 	while ((watch = TAILQ_FIRST(&sc->deadwatches)) != NULL) {
422 		TAILQ_REMOVE(&sc->deadwatches, watch, vlink);
423 		mtx_unlock(&sc->lock);
424 		inotify_free_watch(watch);
425 		mtx_lock(&sc->lock);
426 	}
427 	mtx_unlock(&sc->lock);
428 }
429 
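/*
 * Tear down an inotify descriptor: detach and free all of its watches, wait
 * for deferred reaper work to finish, and discard any pending events.
 */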
430 static int
431 inotify_close(struct file *fp, struct thread *td)
432 {
433 	struct inotify_softc *sc;
434 	struct inotify_record *rec;
435 	struct inotify_watch *watch;
436 
437 	sc = fp->f_data;
438 
439 	/* Detach watches from their vnodes. */
440 	mtx_lock(&sc->lock);
441 	(void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
442 	while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
443 		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
444 		mtx_unlock(&sc->lock);
445 		inotify_remove_watch(watch);
446 		mtx_lock(&sc->lock);
447 	}
448 
449 	/* Make sure that any asynchronous vrele() calls are done. */
450 	mtx_unlock(&sc->lock);
451 	taskqueue_drain(taskqueue_thread, &sc->reaptask);
452 	mtx_lock(&sc->lock);
453 	KASSERT(RB_EMPTY(&sc->watches),
454 	    ("%s: watches not empty in %p", __func__, sc));
455 	KASSERT(TAILQ_EMPTY(&sc->deadwatches),
456 	    ("%s: deadwatches not empty in %p", __func__, sc));
457 
458 	/* Drop pending events. */
459 	while (!STAILQ_EMPTY(&sc->pending)) {
460 		rec = inotify_dequeue(sc);
461 		if (rec != &sc->overflow)
462 			free(rec, M_INOTIFY);
463 	}
464 	mtx_unlock(&sc->lock);
465 	seldrain(&sc->sel);
466 	knlist_destroy(&sc->sel.si_note);
467 	mtx_destroy(&sc->lock);
468 	crfree(sc->cred);
469 	free(sc, M_INOTIFY);
470 	return (0);
471 }
472 
473 static int
474 inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
475     struct filedesc *fdp)
476 {
477 	struct inotify_softc *sc;
478 
479 	sc = fp->f_data;
480 
481 	kif->kf_type = KF_TYPE_INOTIFY;
482 	kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
483 	kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
484 	return (0);
485 }
486 
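/*
 * Create the file object backing a new inotify descriptor: enforce the
 * per-user instance limit, allocate and initialize the softc, and apply the
 * IN_NONBLOCK and IN_CLOEXEC creation flags.
 */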
487 int
488 inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
489 {
490 	struct inotify_softc *sc;
491 	int fflags;
492 
493 	if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
494 		return (EINVAL);
495 
496 	if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
497 	    inotify_max_user_instances))
498 		return (EMFILE);
499 
500 	sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
501 	sc->nextwatch = 1; /* Required for compatibility. */
502 	STAILQ_INIT(&sc->pending);
503 	RB_INIT(&sc->watches);
504 	TAILQ_INIT(&sc->deadwatches);
505 	TASK_INIT(&sc->reaptask, 0, inotify_reap, sc);
506 	mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
507 	knlist_init_mtx(&sc->sel.si_note, &sc->lock);
508 	sc->cred = crhold(td->td_ucred);
509 	sc->ino = atomic_fetchadd_64(&inotify_ino, 1);
510 
511 	fflags = FREAD;
512 	if ((flags & IN_NONBLOCK) != 0)
513 		fflags |= FNONBLOCK;
514 	if ((flags & IN_CLOEXEC) != 0)
515 		*fflagsp |= O_CLOEXEC;
516 	finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);
517 
518 	return (0);
519 }
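/*
 * For reference, a minimal userspace sketch of the interface implemented
 * here, assuming the Linux-compatible libc wrappers (inotify_init1(),
 * inotify_add_watch(), inotify_rm_watch()) that front these syscalls:
 *
 *	int ifd, wd;
 *	char buf[sizeof(struct inotify_event) + NAME_MAX + 1];
 *	ssize_t n;
 *
 *	ifd = inotify_init1(IN_CLOEXEC);
 *	wd = inotify_add_watch(ifd, "/tmp", IN_CREATE | IN_DELETE);
 *	n = read(ifd, buf, sizeof(buf));
 *	if (n > 0) {
 *		struct inotify_event *ev = (struct inotify_event *)buf;
 *		printf("wd %d mask %#x name %s\n", ev->wd, ev->mask,
 *		    ev->len > 0 ? ev->name : "");
 *	}
 *	(void)inotify_rm_watch(ifd, wd);
 *	(void)close(ifd);
 */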
520 
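/*
 * Allocate and fill an event record, copying the name (if any) into the
 * variable-length tail.  Returns NULL only when an M_NOWAIT allocation fails.
 */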
521 static struct inotify_record *
522 inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
523     uint32_t cookie, int waitok)
524 {
525 	struct inotify_event *evp;
526 	struct inotify_record *rec;
527 
528 	rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
529 	    waitok | M_ZERO);
530 	if (rec == NULL)
531 		return (NULL);
532 	evp = &rec->ev;
533 	evp->wd = wd;
534 	evp->mask = event;
535 	evp->cookie = cookie;
536 	evp->len = _IN_NAMESIZE(namelen);
537 	if (name != NULL)
538 		memcpy(evp->name, name, namelen);
539 	return (rec);
540 }
541 
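/*
 * Check whether the new event duplicates the most recently queued record and
 * can therefore be dropped instead of queued.
 */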
542 static bool
543 inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp)
544 {
545 	struct inotify_record *prev;
546 
547 	mtx_assert(&sc->lock, MA_OWNED);
548 
549 	prev = STAILQ_LAST(&sc->pending, inotify_record, link);
550 	return (prev != NULL && prev->ev.mask == evp->mask &&
551 	    prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie &&
552 	    prev->ev.len == evp->len &&
553 	    memcmp(prev->ev.name, evp->name, evp->len) == 0);
554 }
555 
556 static void
557 inotify_overflow_event(struct inotify_event *evp)
558 {
559 	evp->mask = IN_Q_OVERFLOW;
560 	evp->wd = -1;
561 	evp->cookie = 0;
562 	evp->len = 0;
563 }
564 
565 /*
566  * Put an event record on the queue for an inotify descriptor.  Return false if
567  * the record was coalesced or dropped rather than enqueued, true otherwise.
568  */
569 static bool
570 inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
571 {
572 	struct inotify_event *evp;
573 
574 	mtx_assert(&sc->lock, MA_OWNED);
575 
576 	evp = &rec->ev;
577 	if (__predict_false(rec == &sc->overflow)) {
578 		/*
579 		 * Is the overflow record already in the queue?  If so, there's
580 		 * not much else we can do: we're here because a kernel memory
581 		 * shortage prevented new record allocations.
582 		 */
583 		counter_u64_add(inotify_event_drops, 1);
584 		if (evp->mask == IN_Q_OVERFLOW)
585 			return (false);
586 		inotify_overflow_event(evp);
587 	} else {
588 		/* Try to coalesce duplicate events. */
589 		if (inotify_coalesce && inotify_can_coalesce(sc, evp))
590 			return (false);
591 
592 		/*
593 		 * Would this one overflow the queue?  If so, convert it to an
594 		 * overflow event and try again to coalesce.
595 		 */
596 		if (sc->npending >= inotify_max_queued_events) {
597 			counter_u64_add(inotify_event_drops, 1);
598 			inotify_overflow_event(evp);
599 			if (inotify_can_coalesce(sc, evp))
600 				return (false);
601 		}
602 	}
603 	inotify_enqueue(sc, rec, false);
604 	selwakeup(&sc->sel);
605 	KNOTE_LOCKED(&sc->sel.si_note, 0);
606 	wakeup(&sc->pending);
607 	return (true);
608 }
609 
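/*
 * Queue an event for a single watch.  One-shot watches and watches whose
 * vnode is going away (IN_DELETE_SELF or IN_UNMOUNT) also get an IN_IGNORED
 * record and are removed; the final vrele() is handed to the reaper task
 * since sleeping is not permitted here.
 */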
610 static void
611 inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
612     int event, uint32_t cookie)
613 {
614 	struct inotify_watch key;
615 	struct inotify_softc *sc;
616 	struct inotify_record *rec;
617 	bool allocfail;
618 
619 	mtx_assert(&watch->vp->v_pollinfo->vpi_lock, MA_OWNED);
620 
621 	sc = watch->sc;
622 	rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
623 	    M_NOWAIT);
624 	if (rec == NULL) {
625 		rec = &sc->overflow;
626 		allocfail = true;
627 	} else {
628 		allocfail = false;
629 	}
630 
631 	mtx_lock(&sc->lock);
632 	if (!inotify_queue_record(sc, rec) && rec != &sc->overflow)
633 		free(rec, M_INOTIFY);
634 	if ((watch->mask & IN_ONESHOT) != 0 ||
635 	    (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
636 		if (!allocfail) {
637 			rec = inotify_alloc_record(watch->wd, NULL, 0,
638 			    IN_IGNORED, 0, M_NOWAIT);
639 			if (rec == NULL)
640 				rec = &sc->overflow;
641 			if (!inotify_queue_record(sc, rec) &&
642 			    rec != &sc->overflow)
643 				free(rec, M_INOTIFY);
644 		}
645 
646 		/*
647 		 * Remove the watch, taking care to handle races with
648 		 * inotify_close().  The thread that removes the watch is
649 		 * responsible for freeing it.
650 		 */
651 		key.wd = watch->wd;
652 		if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
653 			RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
654 			inotify_unlink_watch_locked(sc, watch);
655 
656 			/*
657 			 * Defer the vrele() to a sleepable thread context.
658 			 */
659 			TAILQ_INSERT_TAIL(&sc->deadwatches, watch, vlink);
660 			taskqueue_enqueue(taskqueue_thread, &sc->reaptask);
661 		}
662 	}
663 	mtx_unlock(&sc->lock);
664 }
665 
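/*
 * Deliver an event to every watch on the vnode whose mask matches it.
 * IN_UNMOUNT is delivered regardless of the watch mask.
 */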
666 void
667 inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
668     uint32_t cookie)
669 {
670 	struct inotify_watch *watch, *tmp;
671 
672 	KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
673 	    ("inotify_log: invalid event %#x", event));
674 
675 	mtx_lock(&vp->v_pollinfo->vpi_lock);
676 	TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
677 		KASSERT(watch->vp == vp,
678 		    ("inotify_log: watch %p vp != vp", watch));
679 		if ((watch->mask & event) != 0 || event == IN_UNMOUNT)
680 			inotify_log_one(watch, name, namelen, event, cookie);
681 	}
682 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
683 }
684 
685 /*
686  * An inotify event occurred on a watched vnode.
687  */
688 void
689 vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
690     int event, uint32_t cookie)
691 {
692 	int isdir;
693 
694 	VNPASS(vp->v_holdcnt > 0, vp);
695 
696 	isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
697 
698 	if (dvp != NULL) {
699 		VNPASS(dvp->v_holdcnt > 0, dvp);
700 
701 		/*
702 		 * Should we log an event for the vnode itself?
703 		 */
704 		if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
705 			int selfevent;
706 
707 			switch (event) {
708 			case _IN_MOVE_DELETE:
709 			case IN_DELETE:
710 				/*
711 				 * IN_DELETE_SELF is only generated when the
712 				 * last hard link of a file is removed.
713 				 */
714 				selfevent = IN_DELETE_SELF;
715 				if (vp->v_type != VDIR) {
716 					struct vattr va;
717 					int error;
718 
719 					error = VOP_GETATTR(vp, &va,
720 					    cnp->cn_cred);
721 					if (error == 0 && va.va_nlink != 0)
722 						selfevent = 0;
723 				}
724 				break;
725 			case IN_MOVED_FROM:
726 				selfevent = IN_MOVE_SELF;
727 				break;
728 			case _IN_ATTRIB_LINKCOUNT:
729 				selfevent = IN_ATTRIB;
730 				break;
731 			default:
732 				selfevent = event;
733 				break;
734 			}
735 
736 			if ((selfevent & ~_IN_DIR_EVENTS) != 0)
737 				inotify_log(vp, NULL, 0, selfevent | isdir, 0);
738 		}
739 
740 		/*
741 		 * Something is watching the directory through which this vnode
742 		 * was referenced, so we may need to log the event.
743 		 */
744 		if ((event & IN_ALL_EVENTS) != 0 &&
745 		    (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
746 			inotify_log(dvp, cnp->cn_nameptr,
747 			    cnp->cn_namelen, event | isdir, cookie);
748 		}
749 	} else {
750 		/*
751 		 * We don't know which watched directory might contain the
752 		 * vnode, so we have to fall back to searching the name cache.
753 		 */
754 		cache_vop_inotify(vp, event, cookie);
755 	}
756 }
757 
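/*
 * Attach a watch to a vnode on behalf of an inotify descriptor.  Returns
 * EJUSTRETURN if an existing watch was updated instead of a new one being
 * created, in which case the caller undoes its accounting.
 */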
758 int
759 vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
760     uint32_t *wdp, struct thread *td)
761 {
762 	struct inotify_watch *watch, *watch1;
763 	uint32_t wd;
764 
765 	/*
766 	 * If this is a directory, make sure all of its entries are present in
767 	 * the name cache so that we're able to look them up if an event occurs.
768 	 * The persistent reference on the directory prevents the outgoing name
769 	 * cache entries from being reclaimed.
770 	 */
771 	if (vp->v_type == VDIR) {
772 		struct dirent *dp;
773 		char *buf;
774 		off_t off;
775 		size_t buflen, len;
776 		int eof, error;
777 
778 		buflen = 128 * sizeof(struct dirent);
779 		buf = malloc(buflen, M_TEMP, M_WAITOK);
780 
781 		error = 0;
782 		len = off = eof = 0;
783 		for (;;) {
784 			struct nameidata nd;
785 
786 			error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
787 			    &len, &off, &eof);
788 			if (error != 0)
789 				break;
790 			if (len == 0)
791 				/* Finished reading. */
792 				break;
793 			if (strcmp(dp->d_name, ".") == 0 ||
794 			    strcmp(dp->d_name, "..") == 0)
795 				continue;
796 
797 			/*
798 			 * namei() consumes a reference on the starting
799 			 * directory if it's specified as a vnode.
800 			 */
801 			vrefact(vp);
802 			VOP_UNLOCK(vp);
803 			NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
804 			    dp->d_name, vp);
805 			error = namei(&nd);
806 			vn_lock(vp, LK_SHARED | LK_RETRY);
807 			if (error != 0)
808 				break;
809 			NDFREE_PNBUF(&nd);
810 			vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
811 			vrele(nd.ni_vp);
812 		}
813 		free(buf, M_TEMP);
814 		if (error != 0)
815 			return (error);
816 	}
817 
818 	/*
819 	 * The vnode referenced in kern_inotify_add_watch() might be different
820 	 * than this one if nullfs is in the picture.
821 	 */
822 	vrefact(vp);
823 	watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
824 	watch->sc = sc;
825 	watch->vp = vp;
826 	watch->mask = mask;
827 
828 	/*
829 	 * Are we updating an existing watch?  Search the vnode's list rather
830 	 * than that of the softc, as the former is likely to be shorter.
831 	 */
832 	v_addpollinfo(vp);
833 	mtx_lock(&vp->v_pollinfo->vpi_lock);
834 	TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
835 		if (watch1->sc == sc)
836 			break;
837 	}
838 	mtx_lock(&sc->lock);
839 	if (watch1 != NULL) {
840 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
841 
842 		/*
843 		 * We found an existing watch, update it based on our flags.
844 		 */
845 		if ((mask & IN_MASK_CREATE) != 0) {
846 			mtx_unlock(&sc->lock);
847 			vrele(vp);
848 			free(watch, M_INOTIFY);
849 			return (EEXIST);
850 		}
851 		if ((mask & IN_MASK_ADD) != 0)
852 			watch1->mask |= mask;
853 		else
854 			watch1->mask = mask;
855 		*wdp = watch1->wd;
856 		mtx_unlock(&sc->lock);
857 		vrele(vp);
858 		free(watch, M_INOTIFY);
859 		return (EJUSTRETURN);
860 	}
861 
862 	/*
863 	 * We're creating a new watch.  Add it to the softc and vnode watch
864 	 * lists.
865 	 */
866 	do {
867 		struct inotify_watch key;
868 
869 		/*
870 		 * Search for the next available watch descriptor.  This is
871 		 * implemented so as to avoid reusing watch descriptors for as
872 		 * long as possible.
873 		 */
874 		key.wd = wd = sc->nextwatch++;
875 		watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
876 	} while (watch1 != NULL || wd == 0);
877 	watch->wd = wd;
878 	RB_INSERT(inotify_watch_tree, &sc->watches, watch);
879 	TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
880 	mtx_unlock(&sc->lock);
881 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
882 	vn_irflag_set_cond(vp, VIRF_INOTIFY);
883 
884 	*wdp = wd;
885 
886 	return (0);
887 }
888 
889 void
890 vn_inotify_revoke(struct vnode *vp)
891 {
892 	if (vp->v_pollinfo == NULL) {
893 		/* This is a nullfs vnode which shadows a watched vnode. */
894 		return;
895 	}
896 	inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
897 }
898 
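/*
 * Resolve a file descriptor to an inotify file, checking capability rights
 * and the file type.
 */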
899 static int
900 fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
901     struct file **fpp)
902 {
903 	struct file *fp;
904 	int error;
905 
906 	error = fget(td, fd, needrightsp, &fp);
907 	if (error != 0)
908 		return (error);
909 	if (fp->f_type != DTYPE_INOTIFY) {
910 		fdrop(fp, td);
911 		return (EINVAL);
912 	}
913 	*fpp = fp;
914 	return (0);
915 }
916 
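/*
 * Backend for inotify_add_watch_at(2): validate the mask, look up the path
 * relative to dfd, enforce the global and per-user watch limits, and attach
 * the watch via VOP_INOTIFY_ADD_WATCH().
 */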
917 int
918 kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
919     struct thread *td)
920 {
921 	struct nameidata nd;
922 	struct file *fp;
923 	struct inotify_softc *sc;
924 	struct vnode *vp;
925 	uint32_t wd;
926 	int count, error;
927 
928 	fp = NULL;
929 	vp = NULL;
930 
931 	if ((mask & IN_ALL_EVENTS) == 0)
932 		return (EXTERROR(EINVAL, "no events specified"));
933 	if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
934 	    (IN_MASK_ADD | IN_MASK_CREATE))
935 		return (EXTERROR(EINVAL,
936 		    "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"));
937 	if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
938 		return (EXTERROR(EINVAL, "unrecognized flag"));
939 
940 	error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
941 	if (error != 0)
942 		return (error);
943 	sc = fp->f_data;
944 
945 	NDINIT_AT(&nd, LOOKUP,
946 	    ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
947 	    LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
948 	error = namei(&nd);
949 	if (error != 0)
950 		goto out;
951 	NDFREE_PNBUF(&nd);
952 	vp = nd.ni_vp;
953 
954 	error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
955 	if (error != 0)
956 		goto out;
957 
958 	if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
959 		error = ENOTDIR;
960 		goto out;
961 	}
962 
963 	count = atomic_fetchadd_int(&inotify_watches, 1);
964 	if (count > inotify_max_watches) {
965 		atomic_subtract_int(&inotify_watches, 1);
966 		error = ENOSPC;
967 		goto out;
968 	}
969 	if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
970 	    inotify_max_user_watches)) {
971 		atomic_subtract_int(&inotify_watches, 1);
972 		error = ENOSPC;
973 		goto out;
974 	}
975 	error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
976 	if (error != 0) {
977 		atomic_subtract_int(&inotify_watches, 1);
978 		(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
979 		if (error == EJUSTRETURN) {
980 			/* We updated an existing watch, everything is ok. */
981 			error = 0;
982 		} else {
983 			goto out;
984 		}
985 	}
986 	td->td_retval[0] = wd;
987 
988 out:
989 	if (vp != NULL)
990 		vput(vp);
991 	fdrop(fp, td);
992 	return (error);
993 }
994 
995 int
996 sys_inotify_add_watch_at(struct thread *td,
997     struct inotify_add_watch_at_args *uap)
998 {
999 	return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
1000 	    uap->mask, td));
1001 }
1002 
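/*
 * Backend for inotify_rm_watch(2): detach the watch identified by wd and
 * queue an IN_IGNORED event for it.
 */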
1003 int
1004 kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
1005 {
1006 	struct file *fp;
1007 	struct inotify_softc *sc;
1008 	struct inotify_record *rec;
1009 	struct inotify_watch key, *watch;
1010 	int error;
1011 
1012 	error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
1013 	if (error != 0)
1014 		return (error);
1015 	sc = fp->f_data;
1016 
1017 	rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);
1018 
1019 	/*
1020 	 * For compatibility with Linux, we do not remove pending events
1021 	 * associated with the watch.  Watch descriptors are implemented so as
1022 	 * to avoid being reused for as long as possible, so one hopes that any
1023 	 * pending events from the removed watch descriptor will be removed
1024 	 * before the watch descriptor is recycled.
1025 	 */
1026 	key.wd = wd;
1027 	mtx_lock(&sc->lock);
1028 	watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
1029 	if (watch == NULL) {
1030 		free(rec, M_INOTIFY);
1031 		error = EINVAL;
1032 	} else {
1033 		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
1034 		if (!inotify_queue_record(sc, rec)) {
1035 			free(rec, M_INOTIFY);
1036 			error = 0;
1037 		}
1038 	}
1039 	mtx_unlock(&sc->lock);
1040 	if (watch != NULL)
1041 		inotify_remove_watch(watch);
1042 	fdrop(fp, td);
1043 	return (error);
1044 }
1045 
1046 int
1047 sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
1048 {
1049 	return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
1050 }
1051