/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2025 Klara, Inc.
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/caprights.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#define EXTERR_CATEGORY EXTERR_CAT_INOTIFY
#include <sys/exterrvar.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/inotify.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/ktrace.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslimits.h>
#include <sys/sysproto.h>
#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <sys/user.h>
#include <sys/vnode.h>

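/*
 * Global counter from which the cookies pairing the IN_MOVED_FROM and
 * IN_MOVED_TO events of a single rename are drawn.
 */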
uint32_t inotify_rename_cookie;

static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "inotify configuration");

static int inotify_max_queued_events = 16384;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
    &inotify_max_queued_events, 0,
    "Maximum number of events to queue on an inotify descriptor");

static int inotify_max_user_instances = 256;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
    &inotify_max_user_instances, 0,
    "Maximum number of inotify descriptors per user");

static int inotify_max_user_watches;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
    &inotify_max_user_watches, 0,
    "Maximum number of inotify watches per user");

static int inotify_max_watches;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
    &inotify_max_watches, 0,
    "Maximum number of inotify watches system-wide");

static int inotify_watches;
SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
    &inotify_watches, 0,
    "Total number of inotify watches currently in use");

static int inotify_coalesce = 1;
SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
    &inotify_coalesce, 0,
    "Coalesce inotify events when possible");

static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
    &inotify_event_drops,
    "Number of inotify events dropped due to limits or allocation failures");

static fo_rdwr_t inotify_read;
static fo_ioctl_t inotify_ioctl;
static fo_poll_t inotify_poll;
static fo_kqfilter_t inotify_kqfilter;
static fo_stat_t inotify_stat;
static fo_close_t inotify_close;
static fo_fill_kinfo_t inotify_fill_kinfo;

static const struct fileops inotifyfdops = {
	.fo_read = inotify_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = inotify_ioctl,
	.fo_poll = inotify_poll,
	.fo_kqfilter = inotify_kqfilter,
	.fo_stat = inotify_stat,
	.fo_close = inotify_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = inotify_fill_kinfo,
	.fo_cmp = file_kcmp_generic,
	.fo_flags = DFLAG_PASSABLE,
};

static void filt_inotifydetach(struct knote *kn);
static int filt_inotifyevent(struct knote *kn, long hint);

static const struct filterops inotify_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_inotifydetach,
	.f_event = filt_inotifyevent,
};

static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");

struct inotify_record {
	STAILQ_ENTRY(inotify_record) link;
	struct inotify_event ev;
};

static uint64_t inotify_ino = 1;
124
125 /*
126 * On LP64 systems this occupies 64 bytes, so we don't get internal
127 * fragmentation by allocating watches with malloc(9). If the size changes,
128 * consider using a UMA zone to improve memory efficiency.
129 */
130 struct inotify_watch {
131 struct inotify_softc *sc; /* back-pointer */
132 int wd; /* unique ID */
133 uint32_t mask; /* event mask */
134 struct vnode *vp; /* vnode being watched, refed */
135 RB_ENTRY(inotify_watch) ilink; /* inotify linkage */
136 TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */
137 };
138
139 static void
inotify_init(void * arg __unused)140 inotify_init(void *arg __unused)
141 {
142 /* Don't let a user hold too many vnodes. */
143 inotify_max_user_watches = desiredvnodes / 3;
144 /* Don't let the system hold too many vnodes. */
145 inotify_max_watches = desiredvnodes / 2;
146 }
147 SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);
148
149 static int
inotify_watch_cmp(const struct inotify_watch * a,const struct inotify_watch * b)150 inotify_watch_cmp(const struct inotify_watch *a,
151 const struct inotify_watch *b)
152 {
153 if (a->wd < b->wd)
154 return (-1);
155 else if (a->wd > b->wd)
156 return (1);
157 else
158 return (0);
159 }
160 RB_HEAD(inotify_watch_tree, inotify_watch);
161 RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);
162
163 struct inotify_softc {
164 struct mtx lock; /* serialize all softc writes */
165 STAILQ_HEAD(, inotify_record) pending; /* events waiting to be read */
166 struct inotify_record overflow; /* preallocated record */
167 int nextwatch; /* next watch ID to try */
168 int npending; /* number of pending events */
169 size_t nbpending; /* bytes available to read */
170 uint64_t ino; /* unique identifier */
171 struct inotify_watch_tree watches; /* active watches */
172 TAILQ_HEAD(, inotify_watch) deadwatches; /* watches pending vrele() */
173 struct task reaptask; /* task to reap dead watches */
174 struct selinfo sel; /* select/poll/kevent info */
175 struct ucred *cred; /* credential ref */
176 };
177
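/*
 * Remove the record at the head of the pending queue and update the pending
 * event counters.  The softc lock must be held.
 */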
static struct inotify_record *
inotify_dequeue(struct inotify_softc *sc)
{
	struct inotify_record *rec;

	mtx_assert(&sc->lock, MA_OWNED);
	KASSERT(!STAILQ_EMPTY(&sc->pending),
	    ("%s: queue for %p is empty", __func__, sc));

	rec = STAILQ_FIRST(&sc->pending);
	STAILQ_REMOVE_HEAD(&sc->pending, link);
	sc->npending--;
	sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
	return (rec);
}

static void
inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
{
	mtx_assert(&sc->lock, MA_OWNED);

	if (head)
		STAILQ_INSERT_HEAD(&sc->pending, rec, link);
	else
		STAILQ_INSERT_TAIL(&sc->pending, rec, link);
	sc->npending++;
	sc->nbpending += sizeof(rec->ev) + rec->ev.len;
}

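/*
 * Copy queued events out to userspace.  The read blocks until at least one
 * event is pending unless the descriptor is non-blocking; the caller's buffer
 * must be large enough to hold the next whole event, and as many whole events
 * as fit are returned.
 */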
static int
inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
    struct thread *td)
{
	struct inotify_softc *sc;
	struct inotify_record *rec;
	int error;
	bool first;

	sc = fp->f_data;
	error = 0;

	mtx_lock(&sc->lock);
	while (STAILQ_EMPTY(&sc->pending)) {
		if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&sc->lock);
			return (EWOULDBLOCK);
		}
		error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
		if (error != 0) {
			mtx_unlock(&sc->lock);
			return (error);
		}
	}
	for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
		size_t len;

		rec = inotify_dequeue(sc);
		len = sizeof(rec->ev) + rec->ev.len;
		if (uio->uio_resid < (ssize_t)len) {
			inotify_enqueue(sc, rec, true);
			if (first) {
				error = EXTERROR(EINVAL,
				    "read buffer is too small");
			}
			break;
		}
		mtx_unlock(&sc->lock);
		error = uiomove(&rec->ev, len, uio);
#ifdef KTRACE
		if (error == 0 && KTRPOINT(td, KTR_STRUCT))
			ktrstruct("inotify", &rec->ev, len);
#endif
		mtx_lock(&sc->lock);
		if (error != 0) {
			inotify_enqueue(sc, rec, true);
			mtx_unlock(&sc->lock);
			return (error);
		}
		if (rec == &sc->overflow) {
			/*
			 * Signal to inotify_queue_record() that the overflow
			 * record can be reused.
			 */
			memset(rec, 0, sizeof(*rec));
		} else {
			free(rec, M_INOTIFY);
		}
	}
	mtx_unlock(&sc->lock);
	return (error);
}

static int
inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
    struct thread *td)
{
	struct inotify_softc *sc;

	sc = fp->f_data;

	switch (com) {
	case FIONREAD:
		*(int *)data = (int)sc->nbpending;
		return (0);
	case FIONBIO:
	case FIOASYNC:
		return (0);
	default:
		return (ENOTTY);
	}

	return (0);
}

static int
inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
{
	struct inotify_softc *sc;
	int revents;

	sc = fp->f_data;
	revents = 0;

	mtx_lock(&sc->lock);
	if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
		revents |= events & (POLLIN | POLLRDNORM);
	else
		selrecord(td, &sc->sel);
	mtx_unlock(&sc->lock);
	return (revents);
}

static void
filt_inotifydetach(struct knote *kn)
{
	struct inotify_softc *sc;

	sc = kn->kn_hook;
	knlist_remove(&sc->sel.si_note, kn, 0);
}

static int
filt_inotifyevent(struct knote *kn, long hint)
{
	struct inotify_softc *sc;

	sc = kn->kn_hook;
	mtx_assert(&sc->lock, MA_OWNED);
	kn->kn_data = sc->nbpending;
	return (kn->kn_data > 0);
}

static int
inotify_kqfilter(struct file *fp, struct knote *kn)
{
	struct inotify_softc *sc;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);
	sc = fp->f_data;
	kn->kn_fop = &inotify_rfiltops;
	kn->kn_hook = sc;
	knlist_add(&sc->sel.si_note, kn, 0);
	return (0);
}

static int
inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
{
	struct inotify_softc *sc;

	sc = fp->f_data;

	memset(sb, 0, sizeof(*sb));
	sb->st_mode = S_IFREG | S_IRUSR;
	sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
	mtx_lock(&sc->lock);
	sb->st_size = sc->nbpending;
	sb->st_blocks = sc->npending;
	sb->st_uid = sc->cred->cr_ruid;
	sb->st_gid = sc->cred->cr_rgid;
	sb->st_ino = sc->ino;
	mtx_unlock(&sc->lock);
	return (0);
}

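/*
 * Detach a watch from its vnode's watch list and release the global and
 * per-user watch counts.  The vnode's pollinfo lock must be held.
 */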
static void
inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch)
{
	struct vnode *vp;

	vp = watch->vp;
	mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);

	atomic_subtract_int(&inotify_watches, 1);
	(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);

	TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
	if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
		vn_irflag_unset(vp, VIRF_INOTIFY);
}

static void
inotify_free_watch(struct inotify_watch *watch)
{
	vrele(watch->vp);
	free(watch, M_INOTIFY);
}

/*
 * Assumes that the watch has already been removed from its softc.
 */
static void
inotify_remove_watch(struct inotify_watch *watch)
{
	struct inotify_softc *sc;
	struct vnode *vp;

	sc = watch->sc;

	vp = watch->vp;
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	inotify_unlink_watch_locked(sc, watch);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	inotify_free_watch(watch);
}

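/*
 * Taskqueue handler which frees watches on the dead watch list.  The vrele()
 * of the watched vnode must happen in a sleepable context, so teardown is
 * deferred here by inotify_log_one().
 */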
static void
inotify_reap(void *arg, int pending)
{
	struct inotify_softc *sc;
	struct inotify_watch *watch;

	sc = arg;
	mtx_lock(&sc->lock);
	while ((watch = TAILQ_FIRST(&sc->deadwatches)) != NULL) {
		TAILQ_REMOVE(&sc->deadwatches, watch, vlink);
		mtx_unlock(&sc->lock);
		inotify_free_watch(watch);
		mtx_lock(&sc->lock);
	}
	mtx_unlock(&sc->lock);
}

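/*
 * Release an inotify descriptor: detach and free all of its watches, wait for
 * deferred teardown to finish, and discard any pending events.
 */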
static int
inotify_close(struct file *fp, struct thread *td)
{
	struct inotify_softc *sc;
	struct inotify_record *rec;
	struct inotify_watch *watch;

	sc = fp->f_data;

	/* Detach watches from their vnodes. */
	mtx_lock(&sc->lock);
	(void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
	while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
		mtx_unlock(&sc->lock);
		inotify_remove_watch(watch);
		mtx_lock(&sc->lock);
	}

	/* Make sure that any asynchronous vrele() calls are done. */
	mtx_unlock(&sc->lock);
	taskqueue_drain(taskqueue_thread, &sc->reaptask);
	mtx_lock(&sc->lock);
	KASSERT(RB_EMPTY(&sc->watches),
	    ("%s: watches not empty in %p", __func__, sc));
	KASSERT(TAILQ_EMPTY(&sc->deadwatches),
	    ("%s: deadwatches not empty in %p", __func__, sc));

	/* Drop pending events. */
	while (!STAILQ_EMPTY(&sc->pending)) {
		rec = inotify_dequeue(sc);
		if (rec != &sc->overflow)
			free(rec, M_INOTIFY);
	}
	mtx_unlock(&sc->lock);
	seldrain(&sc->sel);
	knlist_destroy(&sc->sel.si_note);
	mtx_destroy(&sc->lock);
	crfree(sc->cred);
	free(sc, M_INOTIFY);
	return (0);
}

static int
inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
    struct filedesc *fdp)
{
	struct inotify_softc *sc;

	sc = fp->f_data;

	kif->kf_type = KF_TYPE_INOTIFY;
	kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
	kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
	return (0);
}

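/*
 * Allocate the state for a new inotify descriptor and attach it to the file.
 * The flags may include IN_NONBLOCK and IN_CLOEXEC; creation fails with
 * EMFILE if the per-user instance limit would be exceeded.
 */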
int
inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
{
	struct inotify_softc *sc;
	int fflags;

	if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
		return (EINVAL);

	if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
	    inotify_max_user_instances))
		return (EMFILE);

	sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
	sc->nextwatch = 1; /* Required for compatibility. */
	STAILQ_INIT(&sc->pending);
	RB_INIT(&sc->watches);
	TAILQ_INIT(&sc->deadwatches);
	TASK_INIT(&sc->reaptask, 0, inotify_reap, sc);
	mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
	knlist_init_mtx(&sc->sel.si_note, &sc->lock);
	sc->cred = crhold(td->td_ucred);
	sc->ino = atomic_fetchadd_64(&inotify_ino, 1);

	fflags = FREAD;
	if ((flags & IN_NONBLOCK) != 0)
		fflags |= FNONBLOCK;
	if ((flags & IN_CLOEXEC) != 0)
		*fflagsp |= O_CLOEXEC;
	finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);

	return (0);
}

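/*
 * Allocate an event record, optionally carrying a name.  Returns NULL if
 * M_NOWAIT was specified and memory could not be allocated.
 */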
static struct inotify_record *
inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
    uint32_t cookie, int waitok)
{
	struct inotify_event *evp;
	struct inotify_record *rec;

	rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
	    waitok | M_ZERO);
	if (rec == NULL)
		return (NULL);
	evp = &rec->ev;
	evp->wd = wd;
	evp->mask = event;
	evp->cookie = cookie;
	evp->len = _IN_NAMESIZE(namelen);
	if (name != NULL)
		memcpy(evp->name, name, namelen);
	return (rec);
}

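/*
 * Return true if the event duplicates the most recently queued event, in
 * which case the new event may be dropped.
 */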
static bool
inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp)
{
	struct inotify_record *prev;

	mtx_assert(&sc->lock, MA_OWNED);

	prev = STAILQ_LAST(&sc->pending, inotify_record, link);
	return (prev != NULL && prev->ev.mask == evp->mask &&
	    prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie &&
	    prev->ev.len == evp->len &&
	    memcmp(prev->ev.name, evp->name, evp->len) == 0);
}

static void
inotify_overflow_event(struct inotify_event *evp)
{
	evp->mask = IN_Q_OVERFLOW;
	evp->wd = -1;
	evp->cookie = 0;
	evp->len = 0;
}

/*
 * Put an event record on the queue for an inotify descriptor. Return false if
 * the record was not enqueued for some reason, true otherwise.
 */
static bool
inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
{
	struct inotify_event *evp;

	mtx_assert(&sc->lock, MA_OWNED);

	evp = &rec->ev;
	if (__predict_false(rec == &sc->overflow)) {
		/*
		 * Is the overflow record already in the queue? If so, there's
		 * not much else we can do: we're here because a kernel memory
		 * shortage prevented new record allocations.
		 */
		counter_u64_add(inotify_event_drops, 1);
		if (evp->mask == IN_Q_OVERFLOW)
			return (false);
		inotify_overflow_event(evp);
	} else {
		/* Try to coalesce duplicate events. */
		if (inotify_coalesce && inotify_can_coalesce(sc, evp))
			return (false);

		/*
		 * Would this one overflow the queue? If so, convert it to an
		 * overflow event and try again to coalesce.
		 */
		if (sc->npending >= inotify_max_queued_events) {
			counter_u64_add(inotify_event_drops, 1);
			inotify_overflow_event(evp);
			if (inotify_can_coalesce(sc, evp))
				return (false);
		}
	}
	inotify_enqueue(sc, rec, false);
	selwakeup(&sc->sel);
	KNOTE_LOCKED(&sc->sel.si_note, 0);
	wakeup(&sc->pending);
	return (true);
}

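/*
 * Generate a record for a single watch.  One-shot watches and watches whose
 * target is going away additionally generate IN_IGNORED and are torn down.
 */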
static void
inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
    int event, uint32_t cookie)
{
	struct inotify_watch key;
	struct inotify_softc *sc;
	struct inotify_record *rec;
	bool allocfail;

	mtx_assert(&watch->vp->v_pollinfo->vpi_lock, MA_OWNED);

	sc = watch->sc;
	rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
	    M_NOWAIT);
	if (rec == NULL) {
		rec = &sc->overflow;
		allocfail = true;
	} else {
		allocfail = false;
	}

	mtx_lock(&sc->lock);
	if (!inotify_queue_record(sc, rec) && rec != &sc->overflow)
		free(rec, M_INOTIFY);
	if ((watch->mask & IN_ONESHOT) != 0 ||
	    (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
		if (!allocfail) {
			rec = inotify_alloc_record(watch->wd, NULL, 0,
			    IN_IGNORED, 0, M_NOWAIT);
			if (rec == NULL)
				rec = &sc->overflow;
			if (!inotify_queue_record(sc, rec) &&
			    rec != &sc->overflow)
				free(rec, M_INOTIFY);
		}

		/*
		 * Remove the watch, taking care to handle races with
		 * inotify_close(). The thread that removes the watch is
		 * responsible for freeing it.
		 */
		key.wd = watch->wd;
		if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
			RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
			inotify_unlink_watch_locked(sc, watch);

			/*
			 * Defer the vrele() to a sleepable thread context.
			 */
			TAILQ_INSERT_TAIL(&sc->deadwatches, watch, vlink);
			taskqueue_enqueue(taskqueue_thread, &sc->reaptask);
		}
	}
	mtx_unlock(&sc->lock);
}

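/*
 * Log an event against every watch attached to the vnode whose mask matches,
 * or against all of the vnode's watches when the event is IN_UNMOUNT.
 */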
void
inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
    uint32_t cookie)
{
	struct inotify_watch *watch, *tmp;

	KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
	    ("inotify_log: invalid event %#x", event));

	mtx_lock(&vp->v_pollinfo->vpi_lock);
	TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
		KASSERT(watch->vp == vp,
		    ("inotify_log: watch %p vp != vp", watch));
		if ((watch->mask & event) != 0 || event == IN_UNMOUNT)
			inotify_log_one(watch, name, namelen, event, cookie);
	}
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
}

/*
 * An inotify event occurred on a watched vnode.
 */
void
vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
    int event, uint32_t cookie)
{
	int isdir;

	VNPASS(vp->v_holdcnt > 0, vp);

	isdir = vp->v_type == VDIR ? IN_ISDIR : 0;

	if (dvp != NULL) {
		VNPASS(dvp->v_holdcnt > 0, dvp);

		/*
		 * Should we log an event for the vnode itself?
		 */
		if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
			int selfevent;

			switch (event) {
			case _IN_MOVE_DELETE:
			case IN_DELETE:
				/*
				 * IN_DELETE_SELF is only generated when the
				 * last hard link of a file is removed.
				 */
				selfevent = IN_DELETE_SELF;
				if (vp->v_type != VDIR) {
					struct vattr va;
					int error;

					error = VOP_GETATTR(vp, &va,
					    cnp->cn_cred);
					if (error == 0 && va.va_nlink != 0)
						selfevent = 0;
				}
				break;
			case IN_MOVED_FROM:
				cookie = 0;
				selfevent = IN_MOVE_SELF;
				break;
			case _IN_ATTRIB_LINKCOUNT:
				selfevent = IN_ATTRIB;
				break;
			default:
				selfevent = event;
				break;
			}

			if ((selfevent & ~_IN_DIR_EVENTS) != 0) {
				inotify_log(vp, NULL, 0, selfevent | isdir,
				    cookie);
			}
		}

		/*
		 * Something is watching the directory through which this vnode
		 * was referenced, so we may need to log the event.
		 */
		if ((event & IN_ALL_EVENTS) != 0 &&
		    (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
			inotify_log(dvp, cnp->cn_nameptr,
			    cnp->cn_namelen, event | isdir, cookie);
		}
	} else {
		/*
		 * We don't know which watched directory might contain the
		 * vnode, so we have to fall back to searching the name cache.
		 */
		cache_vop_inotify(vp, event, cookie);
	}
}

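/*
 * Register a watch for a vnode on behalf of an inotify descriptor, or update
 * an existing watch's mask.  Returns EJUSTRETURN in the latter case so that
 * the caller can undo its watch accounting.
 */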
int
vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
    uint32_t *wdp, struct thread *td)
{
	struct inotify_watch *watch, *watch1;
	uint32_t wd;

	/*
	 * If this is a directory, make sure all of its entries are present in
	 * the name cache so that we're able to look them up if an event occurs.
	 * The persistent reference on the directory prevents the outgoing name
	 * cache entries from being reclaimed.
	 */
	if (vp->v_type == VDIR) {
		struct dirent *dp;
		char *buf;
		off_t off;
		size_t buflen, len;
		int eof, error;

		buflen = 128 * sizeof(struct dirent);
		buf = malloc(buflen, M_TEMP, M_WAITOK);

		error = 0;
		len = off = eof = 0;
		for (;;) {
			struct nameidata nd;

			error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
			    &len, &off, &eof);
			if (error != 0)
				break;
			if (len == 0)
				/* Finished reading. */
				break;
			if (strcmp(dp->d_name, ".") == 0 ||
			    strcmp(dp->d_name, "..") == 0)
				continue;

			/*
			 * namei() consumes a reference on the starting
			 * directory if it's specified as a vnode.
			 */
			vrefact(vp);
			VOP_UNLOCK(vp);
			NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
			    dp->d_name, vp);
			error = namei(&nd);
			vn_lock(vp, LK_SHARED | LK_RETRY);
			if (error != 0)
				break;
			NDFREE_PNBUF(&nd);
			vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
			vrele(nd.ni_vp);
		}
		free(buf, M_TEMP);
		if (error != 0)
			return (error);
	}

	/*
	 * The vnode referenced in kern_inotify_add_watch() might be different
	 * from this one if nullfs is in the picture.
	 */
	vrefact(vp);
	watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
	watch->sc = sc;
	watch->vp = vp;
	watch->mask = mask;

	/*
	 * Are we updating an existing watch? Search the vnode's list rather
	 * than that of the softc, as the former is likely to be shorter.
	 */
	v_addpollinfo(vp);
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
		if (watch1->sc == sc)
			break;
	}
	mtx_lock(&sc->lock);
	if (watch1 != NULL) {
		mtx_unlock(&vp->v_pollinfo->vpi_lock);

		/*
		 * We found an existing watch, update it based on our flags.
		 */
		if ((mask & IN_MASK_CREATE) != 0) {
			mtx_unlock(&sc->lock);
			vrele(vp);
			free(watch, M_INOTIFY);
			return (EEXIST);
		}
		if ((mask & IN_MASK_ADD) != 0)
			watch1->mask |= mask;
		else
			watch1->mask = mask;
		*wdp = watch1->wd;
		mtx_unlock(&sc->lock);
		vrele(vp);
		free(watch, M_INOTIFY);
		return (EJUSTRETURN);
	}

	/*
	 * We're creating a new watch. Add it to the softc and vnode watch
	 * lists.
	 */
	do {
		struct inotify_watch key;

		/*
		 * Search for the next available watch descriptor. This is
		 * implemented so as to avoid reusing watch descriptors for as
		 * long as possible.
		 */
		key.wd = wd = sc->nextwatch++;
		watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
	} while (watch1 != NULL || wd == 0);
	watch->wd = wd;
	RB_INSERT(inotify_watch_tree, &sc->watches, watch);
	TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
	mtx_unlock(&sc->lock);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	vn_irflag_set_cond(vp, VIRF_INOTIFY);

	*wdp = wd;

	return (0);
}

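/*
 * Notify watchers that a vnode is going away, e.g., because its filesystem is
 * being forcibly unmounted or the vnode is being revoked.
 */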
void
vn_inotify_revoke(struct vnode *vp)
{
	if (vp->v_pollinfo == NULL) {
		/* This is a nullfs vnode which shadows a watched vnode. */
		return;
	}
	inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
}

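/*
 * Look up a file descriptor, verifying that it refers to an inotify
 * descriptor.
 */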
static int
fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
    struct file **fpp)
{
	struct file *fp;
	int error;

	error = fget(td, fd, needrightsp, &fp);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_INOTIFY) {
		fdrop(fp, td);
		return (EINVAL);
	}
	*fpp = fp;
	return (0);
}

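/*
 * Implement inotify_add_watch_at(): validate the mask, resolve the path
 * relative to dfd, enforce the watch limits, and register the watch with the
 * inotify descriptor.
 */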
int
kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
    struct thread *td)
{
	struct nameidata nd;
	struct file *fp;
	struct inotify_softc *sc;
	struct vnode *vp;
	uint32_t wd;
	int count, error;

	fp = NULL;
	vp = NULL;

	if ((mask & IN_ALL_EVENTS) == 0)
		return (EXTERROR(EINVAL, "no events specified"));
	if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
	    (IN_MASK_ADD | IN_MASK_CREATE))
		return (EXTERROR(EINVAL,
		    "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"));
	if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
		return (EXTERROR(EINVAL, "unrecognized flag"));

	error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
	if (error != 0)
		return (error);
	sc = fp->f_data;

	NDINIT_AT(&nd, LOOKUP,
	    ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
	    LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
	error = namei(&nd);
	if (error != 0)
		goto out;
	NDFREE_PNBUF(&nd);
	vp = nd.ni_vp;

	error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
	if (error != 0)
		goto out;

	if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	count = atomic_fetchadd_int(&inotify_watches, 1);
	if (count > inotify_max_watches) {
		atomic_subtract_int(&inotify_watches, 1);
		error = ENOSPC;
		goto out;
	}
	if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
	    inotify_max_user_watches)) {
		atomic_subtract_int(&inotify_watches, 1);
		error = ENOSPC;
		goto out;
	}
	error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
	if (error != 0) {
		atomic_subtract_int(&inotify_watches, 1);
		(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
		if (error == EJUSTRETURN) {
			/* We updated an existing watch, everything is ok. */
			error = 0;
		} else {
			goto out;
		}
	}
	td->td_retval[0] = wd;

out:
	if (vp != NULL)
		vput(vp);
	fdrop(fp, td);
	return (error);
}

int
sys_inotify_add_watch_at(struct thread *td,
    struct inotify_add_watch_at_args *uap)
{
	return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
	    uap->mask, td));
}

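/*
 * Implement inotify_rm_watch(): remove the watch with the given descriptor
 * and queue an IN_IGNORED event for it.
 */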
int
kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
{
	struct file *fp;
	struct inotify_softc *sc;
	struct inotify_record *rec;
	struct inotify_watch key, *watch;
	int error;

	error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
	if (error != 0)
		return (error);
	sc = fp->f_data;

	rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);

	/*
	 * For compatibility with Linux, we do not remove pending events
	 * associated with the watch.  Watch descriptors are implemented so as
	 * to avoid being reused for as long as possible, so one hopes that any
	 * pending events from the removed watch descriptor will be removed
	 * before the watch descriptor is recycled.
	 */
	key.wd = wd;
	mtx_lock(&sc->lock);
	watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
	if (watch == NULL) {
		free(rec, M_INOTIFY);
		error = EINVAL;
	} else {
		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
		if (!inotify_queue_record(sc, rec)) {
			free(rec, M_INOTIFY);
			error = 0;
		}
	}
	mtx_unlock(&sc->lock);
	if (watch != NULL)
		inotify_remove_watch(watch);
	fdrop(fp, td);
	return (error);
}

int
sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
{
	return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
}