1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2025 Klara, Inc.
5 */
6
7 #include "opt_ktrace.h"
8
9 #include <sys/param.h>
10 #include <sys/systm.h>
11 #include <sys/caprights.h>
12 #include <sys/counter.h>
13 #include <sys/dirent.h>
14 #define EXTERR_CATEGORY EXTERR_CAT_INOTIFY
15 #include <sys/exterrvar.h>
16 #include <sys/fcntl.h>
17 #include <sys/file.h>
18 #include <sys/filio.h>
19 #include <sys/inotify.h>
20 #include <sys/kernel.h>
21 #include <sys/lock.h>
22 #include <sys/ktrace.h>
23 #include <sys/malloc.h>
24 #include <sys/mutex.h>
25 #include <sys/namei.h>
26 #include <sys/poll.h>
27 #include <sys/proc.h>
28 #include <sys/queue.h>
29 #include <sys/resourcevar.h>
30 #include <sys/selinfo.h>
31 #include <sys/stat.h>
32 #include <sys/syscallsubr.h>
33 #include <sys/sysctl.h>
34 #include <sys/sysent.h>
35 #include <sys/syslimits.h>
36 #include <sys/sysproto.h>
37 #include <sys/taskqueue.h>
38 #include <sys/tree.h>
39 #include <sys/user.h>
40 #include <sys/vnode.h>
41
42 uint32_t inotify_rename_cookie;
43
44 static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
45 "inotify configuration");
46
47 static int inotify_max_queued_events = 16384;
48 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
49 &inotify_max_queued_events, 0,
50 "Maximum number of events to queue on an inotify descriptor");
51
52 static int inotify_max_user_instances = 256;
53 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
54 &inotify_max_user_instances, 0,
55 "Maximum number of inotify descriptors per user");
56
57 static int inotify_max_user_watches;
58 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
59 &inotify_max_user_watches, 0,
60 "Maximum number of inotify watches per user");
61
62 static int inotify_max_watches;
63 SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
64 &inotify_max_watches, 0,
65 "Maximum number of inotify watches system-wide");
66
67 static int inotify_watches;
68 SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
69 &inotify_watches, 0,
70 "Total number of inotify watches currently in use");
71
72 static int inotify_coalesce = 1;
73 SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
74 &inotify_coalesce, 0,
75 "Coalesce inotify events when possible");
76
77 static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
78 SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
79 &inotify_event_drops,
80 "Number of inotify events dropped due to limits or allocation failures");
81
82 static fo_rdwr_t inotify_read;
83 static fo_ioctl_t inotify_ioctl;
84 static fo_poll_t inotify_poll;
85 static fo_kqfilter_t inotify_kqfilter;
86 static fo_stat_t inotify_stat;
87 static fo_close_t inotify_close;
88 static fo_fill_kinfo_t inotify_fill_kinfo;
89
90 static const struct fileops inotifyfdops = {
91 .fo_read = inotify_read,
92 .fo_write = invfo_rdwr,
93 .fo_truncate = invfo_truncate,
94 .fo_ioctl = inotify_ioctl,
95 .fo_poll = inotify_poll,
96 .fo_kqfilter = inotify_kqfilter,
97 .fo_stat = inotify_stat,
98 .fo_close = inotify_close,
99 .fo_chmod = invfo_chmod,
100 .fo_chown = invfo_chown,
101 .fo_sendfile = invfo_sendfile,
102 .fo_fill_kinfo = inotify_fill_kinfo,
103 .fo_cmp = file_kcmp_generic,
104 .fo_flags = DFLAG_PASSABLE,
105 };
106
107 static void filt_inotifydetach(struct knote *kn);
108 static int filt_inotifyevent(struct knote *kn, long hint);
109
110 static const struct filterops inotify_rfiltops = {
111 .f_isfd = 1,
112 .f_detach = filt_inotifydetach,
113 .f_event = filt_inotifyevent,
114 .f_copy = knote_triv_copy,
115 };
116
117 static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");
118
119 struct inotify_record {
120 STAILQ_ENTRY(inotify_record) link;
121 struct inotify_event ev;
122 };
123
124 static uint64_t inotify_ino = 1;
125
126 /*
127 * On LP64 systems this occupies 64 bytes, so we don't get internal
128 * fragmentation by allocating watches with malloc(9). If the size changes,
129 * consider using a UMA zone to improve memory efficiency.
130 */
131 struct inotify_watch {
132 struct inotify_softc *sc; /* back-pointer */
133 int wd; /* unique ID */
134 uint32_t mask; /* event mask */
135 struct vnode *vp; /* vnode being watched, refed */
136 RB_ENTRY(inotify_watch) ilink; /* inotify linkage */
137 TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */
138 };
139
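/*
 * Scale the default watch limits to the size of the vnode cache.
 */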
140 static void
141 inotify_init(void *arg __unused)
142 {
143 /* Don't let a user hold too many vnodes. */
144 inotify_max_user_watches = desiredvnodes / 3;
145 /* Don't let the system hold too many vnodes. */
146 inotify_max_watches = desiredvnodes / 2;
147 }
148 SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);
149
150 static int
151 inotify_watch_cmp(const struct inotify_watch *a,
152 const struct inotify_watch *b)
153 {
154 if (a->wd < b->wd)
155 return (-1);
156 else if (a->wd > b->wd)
157 return (1);
158 else
159 return (0);
160 }
161 RB_HEAD(inotify_watch_tree, inotify_watch);
162 RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);
163
164 struct inotify_softc {
165 struct mtx lock; /* serialize all softc writes */
166 STAILQ_HEAD(, inotify_record) pending; /* events waiting to be read */
167 struct inotify_record overflow; /* preallocated record */
168 int nextwatch; /* next watch ID to try */
169 int npending; /* number of pending events */
170 size_t nbpending; /* bytes available to read */
171 uint64_t ino; /* unique identifier */
172 struct inotify_watch_tree watches; /* active watches */
173 TAILQ_HEAD(, inotify_watch) deadwatches; /* watches pending vrele() */
174 struct task reaptask; /* task to reap dead watches */
175 struct selinfo sel; /* select/poll/kevent info */
176 struct ucred *cred; /* credential ref */
177 };
178
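/*
 * Remove and return the record at the head of the pending queue, updating the
 * pending event and byte counts.
 */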
179 static struct inotify_record *
180 inotify_dequeue(struct inotify_softc *sc)
181 {
182 struct inotify_record *rec;
183
184 mtx_assert(&sc->lock, MA_OWNED);
185 KASSERT(!STAILQ_EMPTY(&sc->pending),
186 ("%s: queue for %p is empty", __func__, sc));
187
188 rec = STAILQ_FIRST(&sc->pending);
189 STAILQ_REMOVE_HEAD(&sc->pending, link);
190 sc->npending--;
191 sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
192 return (rec);
193 }
194
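/*
 * Add a record to the pending queue, either at the head (to put back a record
 * that could not be copied out) or at the tail.
 */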
195 static void
196 inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
197 {
198 mtx_assert(&sc->lock, MA_OWNED);
199
200 if (head)
201 STAILQ_INSERT_HEAD(&sc->pending, rec, link);
202 else
203 STAILQ_INSERT_TAIL(&sc->pending, rec, link);
204 sc->npending++;
205 sc->nbpending += sizeof(rec->ev) + rec->ev.len;
206 }
207
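/*
 * Copy out as many pending records as fit in the caller's buffer, sleeping if
 * the queue is empty and the descriptor is not in non-blocking mode.  A buffer
 * too small to hold even the first record yields EINVAL.
 */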
208 static int
209 inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
210 struct thread *td)
211 {
212 struct inotify_softc *sc;
213 struct inotify_record *rec;
214 int error;
215 bool first;
216
217 sc = fp->f_data;
218 error = 0;
219
220 mtx_lock(&sc->lock);
221 while (STAILQ_EMPTY(&sc->pending)) {
222 if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
223 mtx_unlock(&sc->lock);
224 return (EWOULDBLOCK);
225 }
226 error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
227 if (error != 0) {
228 mtx_unlock(&sc->lock);
229 return (error);
230 }
231 }
232 for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
233 size_t len;
234
235 rec = inotify_dequeue(sc);
236 len = sizeof(rec->ev) + rec->ev.len;
237 if (uio->uio_resid < (ssize_t)len) {
238 inotify_enqueue(sc, rec, true);
239 if (first) {
240 error = EXTERROR(EINVAL,
241 "read buffer is too small");
242 }
243 break;
244 }
245 mtx_unlock(&sc->lock);
246 error = uiomove(&rec->ev, len, uio);
247 #ifdef KTRACE
248 if (error == 0 && KTRPOINT(td, KTR_STRUCT))
249 ktrstruct("inotify", &rec->ev, len);
250 #endif
251 mtx_lock(&sc->lock);
252 if (error != 0) {
253 inotify_enqueue(sc, rec, true);
254 mtx_unlock(&sc->lock);
255 return (error);
256 }
257 if (rec == &sc->overflow) {
258 /*
259 * Signal to inotify_queue_record() that the overflow
260 * record can be reused.
261 */
262 memset(rec, 0, sizeof(*rec));
263 } else {
264 free(rec, M_INOTIFY);
265 }
266 }
267 mtx_unlock(&sc->lock);
268 return (error);
269 }
270
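/*
 * FIONREAD reports the number of bytes currently available to read.
 */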
271 static int
272 inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
273 struct thread *td)
274 {
275 struct inotify_softc *sc;
276
277 sc = fp->f_data;
278
279 switch (com) {
280 case FIONREAD:
281 *(int *)data = (int)sc->nbpending;
282 return (0);
283 case FIONBIO:
284 case FIOASYNC:
285 return (0);
286 default:
287 return (ENOTTY);
288 }
289
290 return (0);
291 }
292
293 static int
294 inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
295 {
296 struct inotify_softc *sc;
297 int revents;
298
299 sc = fp->f_data;
300 revents = 0;
301
302 mtx_lock(&sc->lock);
303 if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
304 revents |= events & (POLLIN | POLLRDNORM);
305 else
306 selrecord(td, &sc->sel);
307 mtx_unlock(&sc->lock);
308 return (revents);
309 }
310
311 static void
312 filt_inotifydetach(struct knote *kn)
313 {
314 struct inotify_softc *sc;
315
316 sc = kn->kn_hook;
317 knlist_remove(&sc->sel.si_note, kn, 0);
318 }
319
320 static int
321 filt_inotifyevent(struct knote *kn, long hint)
322 {
323 struct inotify_softc *sc;
324
325 sc = kn->kn_hook;
326 mtx_assert(&sc->lock, MA_OWNED);
327 kn->kn_data = sc->nbpending;
328 return (kn->kn_data > 0);
329 }
330
331 static int
332 inotify_kqfilter(struct file *fp, struct knote *kn)
333 {
334 struct inotify_softc *sc;
335
336 if (kn->kn_filter != EVFILT_READ)
337 return (EINVAL);
338 sc = fp->f_data;
339 kn->kn_fop = &inotify_rfiltops;
340 kn->kn_hook = sc;
341 knlist_add(&sc->sel.si_note, kn, 0);
342 return (0);
343 }
344
345 static int
346 inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
347 {
348 struct inotify_softc *sc;
349
350 sc = fp->f_data;
351
352 memset(sb, 0, sizeof(*sb));
353 sb->st_mode = S_IFREG | S_IRUSR;
354 sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
355 mtx_lock(&sc->lock);
356 sb->st_size = sc->nbpending;
357 sb->st_blocks = sc->npending;
358 sb->st_uid = sc->cred->cr_ruid;
359 sb->st_gid = sc->cred->cr_rgid;
360 sb->st_ino = sc->ino;
361 mtx_unlock(&sc->lock);
362 return (0);
363 }
364
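/*
 * Remove a watch from its vnode's watch list and drop the per-user and global
 * watch counts.  The vnode's pollinfo lock must be held.
 */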
365 static void
366 inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch)
367 {
368 struct vnode *vp;
369
370 vp = watch->vp;
371 mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);
372
373 atomic_subtract_int(&inotify_watches, 1);
374 (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
375
376 TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
377 if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
378 vn_irflag_unset(vp, VIRF_INOTIFY);
379 }
380
381 static void
382 inotify_free_watch(struct inotify_watch *watch)
383 {
384 /*
385 * Formally, we don't need to lock the vnode here. However, if we
386 * don't, and vrele() releases the last reference, it's possible the
387 * vnode will be recycled while a different thread holds the vnode lock.
388 * Work around this bug by acquiring the lock here.
389 */
390 (void)vn_lock(watch->vp, LK_EXCLUSIVE | LK_RETRY);
391 vput(watch->vp);
392 free(watch, M_INOTIFY);
393 }
394
395 /*
396 * Assumes that the watch has already been removed from its softc.
397 */
398 static void
399 inotify_remove_watch(struct inotify_watch *watch)
400 {
401 struct inotify_softc *sc;
402 struct vnode *vp;
403
404 sc = watch->sc;
405
406 vp = watch->vp;
407 mtx_lock(&vp->v_pollinfo->vpi_lock);
408 inotify_unlink_watch_locked(sc, watch);
409 mtx_unlock(&vp->v_pollinfo->vpi_lock);
410 inotify_free_watch(watch);
411 }
412
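/*
 * Taskqueue handler: release vnode references for watches that were torn down
 * in a context where it was unsafe to do so directly (see inotify_log_one()).
 */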
413 static void
414 inotify_reap(void *arg, int pending)
415 {
416 struct inotify_softc *sc;
417 struct inotify_watch *watch;
418
419 sc = arg;
420 mtx_lock(&sc->lock);
421 while ((watch = TAILQ_FIRST(&sc->deadwatches)) != NULL) {
422 TAILQ_REMOVE(&sc->deadwatches, watch, vlink);
423 mtx_unlock(&sc->lock);
424 inotify_free_watch(watch);
425 mtx_lock(&sc->lock);
426 }
427 mtx_unlock(&sc->lock);
428 }
429
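/*
 * Tear down an inotify descriptor: detach and free all of its watches, wait
 * for deferred vrele() calls to finish, and discard any pending events.
 */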
430 static int
431 inotify_close(struct file *fp, struct thread *td)
432 {
433 struct inotify_softc *sc;
434 struct inotify_record *rec;
435 struct inotify_watch *watch;
436
437 sc = fp->f_data;
438
439 /* Detach watches from their vnodes. */
440 mtx_lock(&sc->lock);
441 (void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
442 while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
443 RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
444 mtx_unlock(&sc->lock);
445 inotify_remove_watch(watch);
446 mtx_lock(&sc->lock);
447 }
448
449 /* Make sure that any asynchronous vrele() calls are done. */
450 mtx_unlock(&sc->lock);
451 taskqueue_drain(taskqueue_thread, &sc->reaptask);
452 mtx_lock(&sc->lock);
453 KASSERT(RB_EMPTY(&sc->watches),
454 ("%s: watches not empty in %p", __func__, sc));
455 KASSERT(TAILQ_EMPTY(&sc->deadwatches),
456 ("%s: deadwatches not empty in %p", __func__, sc));
457
458 /* Drop pending events. */
459 while (!STAILQ_EMPTY(&sc->pending)) {
460 rec = inotify_dequeue(sc);
461 if (rec != &sc->overflow)
462 free(rec, M_INOTIFY);
463 }
464 mtx_unlock(&sc->lock);
465 seldrain(&sc->sel);
466 knlist_destroy(&sc->sel.si_note);
467 mtx_destroy(&sc->lock);
468 crfree(sc->cred);
469 free(sc, M_INOTIFY);
470 return (0);
471 }
472
473 static int
474 inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
475 struct filedesc *fdp)
476 {
477 struct inotify_softc *sc;
478
479 sc = fp->f_data;
480
481 kif->kf_type = KF_TYPE_INOTIFY;
482 kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
483 kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
484 return (0);
485 }
486
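/*
 * Allocate the state backing a new inotify descriptor and attach it to the
 * file, honoring the IN_NONBLOCK and IN_CLOEXEC creation flags.
 */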
487 int
488 inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
489 {
490 struct inotify_softc *sc;
491 int fflags;
492
493 if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
494 return (EINVAL);
495
496 if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
497 inotify_max_user_instances))
498 return (EMFILE);
499
500 sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
501 sc->nextwatch = 1; /* Required for compatibility. */
502 STAILQ_INIT(&sc->pending);
503 RB_INIT(&sc->watches);
504 TAILQ_INIT(&sc->deadwatches);
505 TASK_INIT(&sc->reaptask, 0, inotify_reap, sc);
506 mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
507 knlist_init_mtx(&sc->sel.si_note, &sc->lock);
508 sc->cred = crhold(td->td_ucred);
509 sc->ino = atomic_fetchadd_64(&inotify_ino, 1);
510
511 fflags = FREAD;
512 if ((flags & IN_NONBLOCK) != 0)
513 fflags |= FNONBLOCK;
514 if ((flags & IN_CLOEXEC) != 0)
515 *fflagsp |= O_CLOEXEC;
516 finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);
517
518 return (0);
519 }
520
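/*
 * Allocate an event record, copying the name (if any) into the buffer that
 * follows the inotify_event header.
 */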
521 static struct inotify_record *
522 inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
523 uint32_t cookie, int waitok)
524 {
525 struct inotify_event *evp;
526 struct inotify_record *rec;
527
528 rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
529 waitok | M_ZERO);
530 if (rec == NULL)
531 return (NULL);
532 evp = &rec->ev;
533 evp->wd = wd;
534 evp->mask = event;
535 evp->cookie = cookie;
536 evp->len = _IN_NAMESIZE(namelen);
537 if (name != NULL)
538 memcpy(evp->name, name, namelen);
539 return (rec);
540 }
541
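/*
 * Return true if the event is identical to the most recently queued one and
 * may therefore be dropped.
 */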
542 static bool
543 inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp)
544 {
545 struct inotify_record *prev;
546
547 mtx_assert(&sc->lock, MA_OWNED);
548
549 prev = STAILQ_LAST(&sc->pending, inotify_record, link);
550 return (prev != NULL && prev->ev.mask == evp->mask &&
551 prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie &&
552 prev->ev.len == evp->len &&
553 memcmp(prev->ev.name, evp->name, evp->len) == 0);
554 }
555
556 static void
557 inotify_overflow_event(struct inotify_event *evp)
558 {
559 evp->mask = IN_Q_OVERFLOW;
560 evp->wd = -1;
561 evp->cookie = 0;
562 evp->len = 0;
563 }
564
565 /*
566 * Put an event record on the queue for an inotify descriptor. Return false if
567 * the record was not enqueued for some reason, true otherwise.
568 */
569 static bool
570 inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
571 {
572 struct inotify_event *evp;
573
574 mtx_assert(&sc->lock, MA_OWNED);
575
576 evp = &rec->ev;
577 if (__predict_false(rec == &sc->overflow)) {
578 /*
579 * Is the overflow record already in the queue? If so, there's
580 * not much else we can do: we're here because a kernel memory
581 * shortage prevented new record allocations.
582 */
583 counter_u64_add(inotify_event_drops, 1);
584 if (evp->mask == IN_Q_OVERFLOW)
585 return (false);
586 inotify_overflow_event(evp);
587 } else {
588 /* Try to coalesce duplicate events. */
589 if (inotify_coalesce && inotify_can_coalesce(sc, evp))
590 return (false);
591
592 /*
593 * Would this one overflow the queue? If so, convert it to an
594 * overflow event and try again to coalesce.
595 */
596 if (sc->npending >= inotify_max_queued_events) {
597 counter_u64_add(inotify_event_drops, 1);
598 inotify_overflow_event(evp);
599 if (inotify_can_coalesce(sc, evp))
600 return (false);
601 }
602 }
603 inotify_enqueue(sc, rec, false);
604 selwakeup(&sc->sel);
605 KNOTE_LOCKED(&sc->sel.si_note, 0);
606 wakeup(&sc->pending);
607 return (true);
608 }
609
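/*
 * Queue an event for a single watch.  One-shot watches, and watches whose
 * target is being deleted or unmounted, also receive an IN_IGNORED event and
 * are then torn down.
 */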
610 static void
611 inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
612 int event, uint32_t cookie)
613 {
614 struct inotify_watch key;
615 struct inotify_softc *sc;
616 struct inotify_record *rec;
617 bool allocfail;
618
619 mtx_assert(&watch->vp->v_pollinfo->vpi_lock, MA_OWNED);
620
621 sc = watch->sc;
622 rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
623 M_NOWAIT);
624 if (rec == NULL) {
625 rec = &sc->overflow;
626 allocfail = true;
627 } else {
628 allocfail = false;
629 }
630
631 mtx_lock(&sc->lock);
632 if (!inotify_queue_record(sc, rec) && rec != &sc->overflow)
633 free(rec, M_INOTIFY);
634 if ((watch->mask & IN_ONESHOT) != 0 ||
635 (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
636 if (!allocfail) {
637 rec = inotify_alloc_record(watch->wd, NULL, 0,
638 IN_IGNORED, 0, M_NOWAIT);
639 if (rec == NULL)
640 rec = &sc->overflow;
641 if (!inotify_queue_record(sc, rec) &&
642 rec != &sc->overflow)
643 free(rec, M_INOTIFY);
644 }
645
646 /*
647 * Remove the watch, taking care to handle races with
648 * inotify_close(). The thread that removes the watch is
649 * responsible for freeing it.
650 */
651 key.wd = watch->wd;
652 if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
653 RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
654 inotify_unlink_watch_locked(sc, watch);
655
656 /*
657 * Defer the vrele() to a sleepable thread context.
658 */
659 TAILQ_INSERT_TAIL(&sc->deadwatches, watch, vlink);
660 taskqueue_enqueue(taskqueue_thread, &sc->reaptask);
661 }
662 }
663 mtx_unlock(&sc->lock);
664 }
665
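/*
 * Queue an event for each watch on the vnode whose mask matches the event, or
 * for all watches if the vnode is being unmounted.
 */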
666 void
667 inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
668 uint32_t cookie)
669 {
670 struct inotify_watch *watch, *tmp;
671
672 KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
673 ("inotify_log: invalid event %#x", event));
674
675 mtx_lock(&vp->v_pollinfo->vpi_lock);
676 TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
677 KASSERT(watch->vp == vp,
678 ("inotify_log: watch %p vp != vp", watch));
679 if ((watch->mask & event) != 0 || event == IN_UNMOUNT)
680 inotify_log_one(watch, name, namelen, event, cookie);
681 }
682 mtx_unlock(&vp->v_pollinfo->vpi_lock);
683 }
684
685 /*
686 * An inotify event occurred on a watched vnode.
687 */
688 void
689 vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
690 int event, uint32_t cookie)
691 {
692 int isdir;
693
694 VNPASS(vp->v_holdcnt > 0, vp);
695
696 isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
697
698 if (dvp != NULL) {
699 VNPASS(dvp->v_holdcnt > 0, dvp);
700
701 /*
702 * Should we log an event for the vnode itself?
703 */
704 if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
705 int selfevent;
706
707 switch (event) {
708 case _IN_MOVE_DELETE:
709 case IN_DELETE:
710 /*
711 * IN_DELETE_SELF is only generated when the
712 * last hard link of a file is removed.
713 */
714 selfevent = IN_DELETE_SELF;
715 if (vp->v_type != VDIR) {
716 struct vattr va;
717 int error;
718
719 error = VOP_GETATTR(vp, &va,
720 cnp->cn_cred);
721 if (error == 0 && va.va_nlink != 0)
722 selfevent = 0;
723 }
724 break;
725 case IN_MOVED_FROM:
726 cookie = 0;
727 selfevent = IN_MOVE_SELF;
728 break;
729 case _IN_ATTRIB_LINKCOUNT:
730 selfevent = IN_ATTRIB;
731 break;
732 default:
733 selfevent = event;
734 break;
735 }
736
737 if ((selfevent & ~_IN_DIR_EVENTS) != 0) {
738 inotify_log(vp, NULL, 0, selfevent | isdir,
739 cookie);
740 }
741 }
742
743 /*
744 * Something is watching the directory through which this vnode
745 * was referenced, so we may need to log the event.
746 */
747 if ((event & IN_ALL_EVENTS) != 0 &&
748 (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
749 inotify_log(dvp, cnp->cn_nameptr,
750 cnp->cn_namelen, event | isdir, cookie);
751 }
752 } else {
753 /*
754 * We don't know which watched directory might contain the
755 * vnode, so we have to fall back to searching the name cache.
756 */
757 cache_vop_inotify(vp, event, cookie);
758 }
759 }
760
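/*
 * Add a watch for the vnode to an inotify descriptor, or update the mask of an
 * existing watch; the latter case is reported to the caller with EJUSTRETURN.
 */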
761 int
762 vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
763 uint32_t *wdp, struct thread *td)
764 {
765 struct inotify_watch *watch, *watch1;
766 uint32_t wd;
767
768 /*
769 * If this is a directory, make sure all of its entries are present in
770 * the name cache so that we're able to look them up if an event occurs.
771 * The persistent reference on the directory prevents the outgoing name
772 * cache entries from being reclaimed.
773 */
774 if (vp->v_type == VDIR) {
775 struct dirent *dp;
776 char *buf;
777 off_t off;
778 size_t buflen, len;
779 int eof, error;
780
781 buflen = 128 * sizeof(struct dirent);
782 buf = malloc(buflen, M_TEMP, M_WAITOK);
783
784 error = 0;
785 len = off = eof = 0;
786 for (;;) {
787 struct nameidata nd;
788
789 error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
790 &len, &off, &eof);
791 if (error != 0)
792 break;
793 if (len == 0)
794 /* Finished reading. */
795 break;
796 if (strcmp(dp->d_name, ".") == 0 ||
797 strcmp(dp->d_name, "..") == 0)
798 continue;
799
800 /*
801 * namei() consumes a reference on the starting
802 * directory if it's specified as a vnode.
803 */
804 vrefact(vp);
805 VOP_UNLOCK(vp);
806 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
807 dp->d_name, vp);
808 error = namei(&nd);
809 vn_lock(vp, LK_SHARED | LK_RETRY);
810 if (error != 0)
811 break;
812 NDFREE_PNBUF(&nd);
813 vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
814 vrele(nd.ni_vp);
815 }
816 free(buf, M_TEMP);
817 if (error != 0)
818 return (error);
819 }
820
821 /*
822 * The vnode referenced in kern_inotify_add_watch() might be different
823 * than this one if nullfs is in the picture.
824 */
825 vrefact(vp);
826 watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
827 watch->sc = sc;
828 watch->vp = vp;
829 watch->mask = mask;
830
831 /*
832 * Are we updating an existing watch? Search the vnode's list rather
833 * than that of the softc, as the former is likely to be shorter.
834 */
835 v_addpollinfo(vp);
836 mtx_lock(&vp->v_pollinfo->vpi_lock);
837 TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
838 if (watch1->sc == sc)
839 break;
840 }
841 mtx_lock(&sc->lock);
842 if (watch1 != NULL) {
843 mtx_unlock(&vp->v_pollinfo->vpi_lock);
844
845 /*
846 * We found an existing watch, update it based on our flags.
847 */
848 if ((mask & IN_MASK_CREATE) != 0) {
849 mtx_unlock(&sc->lock);
850 vrele(vp);
851 free(watch, M_INOTIFY);
852 return (EEXIST);
853 }
854 if ((mask & IN_MASK_ADD) != 0)
855 watch1->mask |= mask;
856 else
857 watch1->mask = mask;
858 *wdp = watch1->wd;
859 mtx_unlock(&sc->lock);
860 vrele(vp);
861 free(watch, M_INOTIFY);
862 return (EJUSTRETURN);
863 }
864
865 /*
866 * We're creating a new watch. Add it to the softc and vnode watch
867 * lists.
868 */
869 do {
870 struct inotify_watch key;
871
872 /*
873 * Search for the next available watch descriptor. This is
874 * implemented so as to avoid reusing watch descriptors for as
875 * long as possible.
876 */
877 key.wd = wd = sc->nextwatch++;
878 watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
879 } while (watch1 != NULL || wd == 0);
880 watch->wd = wd;
881 RB_INSERT(inotify_watch_tree, &sc->watches, watch);
882 TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
883 mtx_unlock(&sc->lock);
884 mtx_unlock(&vp->v_pollinfo->vpi_lock);
885 vn_irflag_set_cond(vp, VIRF_INOTIFY);
886
887 *wdp = wd;
888
889 return (0);
890 }
891
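/*
 * Log an unmount event to all watchers of a vnode that is being revoked.
 */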
892 void
893 vn_inotify_revoke(struct vnode *vp)
894 {
895 if (vp->v_pollinfo == NULL) {
896 /* This is a nullfs vnode which shadows a watched vnode. */
897 return;
898 }
899 inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
900 }
901
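/*
 * Look up an inotify descriptor, validating the file type and the supplied
 * capability rights.
 */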
902 static int
903 fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
904 struct file **fpp)
905 {
906 struct file *fp;
907 int error;
908
909 error = fget(td, fd, needrightsp, &fp);
910 if (error != 0)
911 return (error);
912 if (fp->f_type != DTYPE_INOTIFY) {
913 fdrop(fp, td);
914 return (EINVAL);
915 }
916 *fpp = fp;
917 return (0);
918 }
919
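/*
 * Validate the event mask, look up the target path relative to dfd, check read
 * access, and enforce the per-user and global watch limits before installing a
 * watch on the resulting vnode.
 */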
920 int
921 kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
922 struct thread *td)
923 {
924 struct nameidata nd;
925 struct file *fp;
926 struct inotify_softc *sc;
927 struct vnode *vp;
928 uint32_t wd;
929 int count, error;
930
931 fp = NULL;
932 vp = NULL;
933
934 if ((mask & IN_ALL_EVENTS) == 0)
935 return (EXTERROR(EINVAL, "no events specified"));
936 if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
937 (IN_MASK_ADD | IN_MASK_CREATE))
938 return (EXTERROR(EINVAL,
939 "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"));
940 if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
941 return (EXTERROR(EINVAL, "unrecognized flag"));
942
943 error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
944 if (error != 0)
945 return (error);
946 sc = fp->f_data;
947
948 NDINIT_AT(&nd, LOOKUP,
949 ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
950 LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
951 error = namei(&nd);
952 if (error != 0)
953 goto out;
954 NDFREE_PNBUF(&nd);
955 vp = nd.ni_vp;
956
957 error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
958 if (error != 0)
959 goto out;
960
961 if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
962 error = ENOTDIR;
963 goto out;
964 }
965
966 count = atomic_fetchadd_int(&inotify_watches, 1);
967 if (count > inotify_max_watches) {
968 atomic_subtract_int(&inotify_watches, 1);
969 error = ENOSPC;
970 goto out;
971 }
972 if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
973 inotify_max_user_watches)) {
974 atomic_subtract_int(&inotify_watches, 1);
975 error = ENOSPC;
976 goto out;
977 }
978 error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
979 if (error != 0) {
980 atomic_subtract_int(&inotify_watches, 1);
981 (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
982 if (error == EJUSTRETURN) {
983 /* We updated an existing watch, everything is ok. */
984 error = 0;
985 } else {
986 goto out;
987 }
988 }
989 td->td_retval[0] = wd;
990
991 out:
992 if (vp != NULL)
993 vput(vp);
994 fdrop(fp, td);
995 return (error);
996 }
997
998 int
999 sys_inotify_add_watch_at(struct thread *td,
1000 struct inotify_add_watch_at_args *uap)
1001 {
1002 return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
1003 uap->mask, td));
1004 }
1005
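/*
 * Remove a watch by descriptor and queue an IN_IGNORED event so that the
 * consumer can clean up its own state.
 */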
1006 int
1007 kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
1008 {
1009 struct file *fp;
1010 struct inotify_softc *sc;
1011 struct inotify_record *rec;
1012 struct inotify_watch key, *watch;
1013 int error;
1014
1015 error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
1016 if (error != 0)
1017 return (error);
1018 sc = fp->f_data;
1019
1020 rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);
1021
1022 /*
1023 * For compatibility with Linux, we do not remove pending events
1024 * associated with the watch. Watch descriptors are implemented so as
1025 * to avoid being reused for as long as possible, so one hopes that any
1026 * pending events from the removed watch descriptor will be removed
1027 * before the watch descriptor is recycled.
1028 */
1029 key.wd = wd;
1030 mtx_lock(&sc->lock);
1031 watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
1032 if (watch == NULL) {
1033 free(rec, M_INOTIFY);
1034 error = EINVAL;
1035 } else {
1036 RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
1037 if (!inotify_queue_record(sc, rec)) {
1038 free(rec, M_INOTIFY);
1039 error = 0;
1040 }
1041 }
1042 mtx_unlock(&sc->lock);
1043 if (watch != NULL)
1044 inotify_remove_watch(watch);
1045 fdrop(fp, td);
1046 return (error);
1047 }
1048
1049 int
1050 sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
1051 {
1052 return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
1053 }
1054