/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2025 Klara, Inc.
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/caprights.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#define	EXTERR_CATEGORY	EXTERR_CAT_INOTIFY
#include <sys/exterrvar.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/inotify.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/ktrace.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslimits.h>
#include <sys/sysproto.h>
#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <sys/user.h>
#include <sys/vnode.h>

uint32_t inotify_rename_cookie;

static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "inotify configuration");

static int inotify_max_queued_events = 16384;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
    &inotify_max_queued_events, 0,
    "Maximum number of events to queue on an inotify descriptor");

static int inotify_max_user_instances = 256;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
    &inotify_max_user_instances, 0,
    "Maximum number of inotify descriptors per user");

static int inotify_max_user_watches;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
    &inotify_max_user_watches, 0,
    "Maximum number of inotify watches per user");

static int inotify_max_watches;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
    &inotify_max_watches, 0,
    "Maximum number of inotify watches system-wide");

static int inotify_watches;
SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
    &inotify_watches, 0,
    "Total number of inotify watches currently in use");

static int inotify_coalesce = 1;
SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
    &inotify_coalesce, 0,
    "Coalesce inotify events when possible");

static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
    &inotify_event_drops,
    "Number of inotify events dropped due to limits or allocation failures");

static fo_rdwr_t	inotify_read;
static fo_ioctl_t	inotify_ioctl;
static fo_poll_t	inotify_poll;
static fo_kqfilter_t	inotify_kqfilter;
static fo_stat_t	inotify_stat;
static fo_close_t	inotify_close;
static fo_fill_kinfo_t	inotify_fill_kinfo;

static const struct fileops inotifyfdops = {
	.fo_read = inotify_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = inotify_ioctl,
	.fo_poll = inotify_poll,
	.fo_kqfilter = inotify_kqfilter,
	.fo_stat = inotify_stat,
	.fo_close = inotify_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = inotify_fill_kinfo,
	.fo_cmp = file_kcmp_generic,
	.fo_flags = DFLAG_PASSABLE,
};

static void	filt_inotifydetach(struct knote *kn);
static int	filt_inotifyevent(struct knote *kn, long hint);

static const struct filterops inotify_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_inotifydetach,
	.f_event = filt_inotifyevent,
	.f_copy = knote_triv_copy,
};

static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");

struct inotify_record {
	STAILQ_ENTRY(inotify_record) link;
	struct inotify_event	ev;
};

static uint64_t inotify_ino = 1;

/*
 * On LP64 systems this occupies 64 bytes, so we don't get internal
 * fragmentation by allocating watches with malloc(9).  If the size changes,
 * consider using a UMA zone to improve memory efficiency.
 */
struct inotify_watch {
	struct inotify_softc *sc;	/* back-pointer */
	int		wd;		/* unique ID */
	uint32_t	mask;		/* event mask */
	struct vnode	*vp;		/* vnode being watched, refed */
	RB_ENTRY(inotify_watch) ilink;	/* inotify linkage */
	TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */
};

static void
inotify_init(void *arg __unused)
{
	/* Don't let a user hold too many vnodes. */
	inotify_max_user_watches = desiredvnodes / 3;
	/* Don't let the system hold too many vnodes. */
	inotify_max_watches = desiredvnodes / 2;
}
SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);

static int
inotify_watch_cmp(const struct inotify_watch *a,
    const struct inotify_watch *b)
{
	if (a->wd < b->wd)
		return (-1);
	else if (a->wd > b->wd)
		return (1);
	else
		return (0);
}
RB_HEAD(inotify_watch_tree, inotify_watch);
RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);

struct inotify_softc {
	struct mtx	lock;		/* serialize all softc writes */
	STAILQ_HEAD(, inotify_record) pending; /* events waiting to be read */
	struct inotify_record overflow;	/* preallocated record */
	int		nextwatch;	/* next watch ID to try */
	int		npending;	/* number of pending events */
	size_t		nbpending;	/* bytes available to read */
	uint64_t	ino;		/* unique identifier */
	struct inotify_watch_tree watches; /* active watches */
	TAILQ_HEAD(, inotify_watch) deadwatches; /* watches pending vrele() */
	struct task	reaptask;	/* task to reap dead watches */
	struct selinfo	sel;		/* select/poll/kevent info */
	struct ucred	*cred;		/* credential ref */
};
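
/*
 * Queue manipulation helpers.  Pending events are kept on a STAILQ, with
 * npending and nbpending tracking the number of queued events and the number
 * of bytes available to read.  The softc lock must be held.
 */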
static struct inotify_record *
inotify_dequeue(struct inotify_softc *sc)
{
	struct inotify_record *rec;

	mtx_assert(&sc->lock, MA_OWNED);
	KASSERT(!STAILQ_EMPTY(&sc->pending),
	    ("%s: queue for %p is empty", __func__, sc));

	rec = STAILQ_FIRST(&sc->pending);
	STAILQ_REMOVE_HEAD(&sc->pending, link);
	sc->npending--;
	sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
	return (rec);
}

static void
inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
{
	mtx_assert(&sc->lock, MA_OWNED);

	if (head)
		STAILQ_INSERT_HEAD(&sc->pending, rec, link);
	else
		STAILQ_INSERT_TAIL(&sc->pending, rec, link);
	sc->npending++;
	sc->nbpending += sizeof(rec->ev) + rec->ev.len;
}

static int
inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
    struct thread *td)
{
	struct inotify_softc *sc;
	struct inotify_record *rec;
	int error;
	bool first;

	sc = fp->f_data;
	error = 0;

	mtx_lock(&sc->lock);
	while (STAILQ_EMPTY(&sc->pending)) {
		if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&sc->lock);
			return (EWOULDBLOCK);
		}
		error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
		if (error != 0) {
			mtx_unlock(&sc->lock);
			return (error);
		}
	}
	for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
		size_t len;

		rec = inotify_dequeue(sc);
		len = sizeof(rec->ev) + rec->ev.len;
		if (uio->uio_resid < (ssize_t)len) {
			inotify_enqueue(sc, rec, true);
			if (first) {
				error = EXTERROR(EINVAL,
				    "read buffer is too small");
			}
			break;
		}
		mtx_unlock(&sc->lock);
		error = uiomove(&rec->ev, len, uio);
#ifdef KTRACE
		if (error == 0 && KTRPOINT(td, KTR_STRUCT))
			ktrstruct("inotify", &rec->ev, len);
#endif
		mtx_lock(&sc->lock);
		if (error != 0) {
			inotify_enqueue(sc, rec, true);
			mtx_unlock(&sc->lock);
			return (error);
		}
		if (rec == &sc->overflow) {
			/*
			 * Signal to inotify_queue_record() that the overflow
			 * record can be reused.
			 */
			memset(rec, 0, sizeof(*rec));
		} else {
			free(rec, M_INOTIFY);
		}
	}
	mtx_unlock(&sc->lock);
	return (error);
}

static int
inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
    struct thread *td)
{
	struct inotify_softc *sc;

	sc = fp->f_data;

	switch (com) {
	case FIONREAD:
		*(int *)data = (int)sc->nbpending;
		return (0);
	case FIONBIO:
	case FIOASYNC:
		return (0);
	default:
		return (ENOTTY);
	}

	return (0);
}

static int
inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
{
	struct inotify_softc *sc;
	int revents;

	sc = fp->f_data;
	revents = 0;

	mtx_lock(&sc->lock);
	if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
		revents |= events & (POLLIN | POLLRDNORM);
	else
		selrecord(td, &sc->sel);
	mtx_unlock(&sc->lock);
	return (revents);
}

static void
filt_inotifydetach(struct knote *kn)
{
	struct inotify_softc *sc;

	sc = kn->kn_hook;
	knlist_remove(&sc->sel.si_note, kn, 0);
}

static int
filt_inotifyevent(struct knote *kn, long hint)
{
	struct inotify_softc *sc;

	sc = kn->kn_hook;
	mtx_assert(&sc->lock, MA_OWNED);
	kn->kn_data = sc->nbpending;
	return (kn->kn_data > 0);
}

static int
inotify_kqfilter(struct file *fp, struct knote *kn)
{
	struct inotify_softc *sc;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);
	sc = fp->f_data;
	kn->kn_fop = &inotify_rfiltops;
	kn->kn_hook = sc;
	knlist_add(&sc->sel.si_note, kn, 0);
	return (0);
}

static int
inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
{
	struct inotify_softc *sc;

	sc = fp->f_data;

	memset(sb, 0, sizeof(*sb));
	sb->st_mode = S_IFREG | S_IRUSR;
	sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
	mtx_lock(&sc->lock);
	sb->st_size = sc->nbpending;
	sb->st_blocks = sc->npending;
	sb->st_uid = sc->cred->cr_ruid;
	sb->st_gid = sc->cred->cr_rgid;
	sb->st_ino = sc->ino;
	mtx_unlock(&sc->lock);
	return (0);
}

static void
inotify_unlink_watch_locked(struct inotify_softc *sc,
    struct inotify_watch *watch)
{
	struct vnode *vp;

	vp = watch->vp;
	mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);

	atomic_subtract_int(&inotify_watches, 1);
	(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);

	TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
	if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
		vn_irflag_unset(vp, VIRF_INOTIFY);
}

static void
inotify_free_watch(struct inotify_watch *watch)
{
	/*
	 * Formally, we don't need to lock the vnode here.  However, if we
	 * don't, and vrele() releases the last reference, it's possible the
	 * vnode will be recycled while a different thread holds the vnode
	 * lock.  Work around this bug by acquiring the lock here.
	 */
	(void)vn_lock(watch->vp, LK_EXCLUSIVE | LK_RETRY);
	vput(watch->vp);
	free(watch, M_INOTIFY);
}

/*
 * Assumes that the watch has already been removed from its softc.
 */
static void
inotify_remove_watch(struct inotify_watch *watch)
{
	struct inotify_softc *sc;
	struct vnode *vp;

	sc = watch->sc;

	vp = watch->vp;
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	inotify_unlink_watch_locked(sc, watch);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	inotify_free_watch(watch);
}

static void
inotify_reap(void *arg, int pending)
{
	struct inotify_softc *sc;
	struct inotify_watch *watch;

	sc = arg;
	mtx_lock(&sc->lock);
	while ((watch = TAILQ_FIRST(&sc->deadwatches)) != NULL) {
		TAILQ_REMOVE(&sc->deadwatches, watch, vlink);
		mtx_unlock(&sc->lock);
		inotify_free_watch(watch);
		mtx_lock(&sc->lock);
	}
	mtx_unlock(&sc->lock);
}

static int
inotify_close(struct file *fp, struct thread *td)
{
	struct inotify_softc *sc;
	struct inotify_record *rec;
	struct inotify_watch *watch;

	sc = fp->f_data;

	/* Detach watches from their vnodes. */
	mtx_lock(&sc->lock);
	(void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
	while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
		mtx_unlock(&sc->lock);
		inotify_remove_watch(watch);
		mtx_lock(&sc->lock);
	}

	/* Make sure that any asynchronous vrele() calls are done. */
	mtx_unlock(&sc->lock);
	taskqueue_drain(taskqueue_thread, &sc->reaptask);
	mtx_lock(&sc->lock);
	KASSERT(RB_EMPTY(&sc->watches),
	    ("%s: watches not empty in %p", __func__, sc));
	KASSERT(TAILQ_EMPTY(&sc->deadwatches),
	    ("%s: deadwatches not empty in %p", __func__, sc));

	/* Drop pending events. */
	while (!STAILQ_EMPTY(&sc->pending)) {
		rec = inotify_dequeue(sc);
		if (rec != &sc->overflow)
			free(rec, M_INOTIFY);
	}
	mtx_unlock(&sc->lock);
	seldrain(&sc->sel);
	knlist_destroy(&sc->sel.si_note);
	mtx_destroy(&sc->lock);
	crfree(sc->cred);
	free(sc, M_INOTIFY);
	return (0);
}

static int
inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
    struct filedesc *fdp)
{
	struct inotify_softc *sc;

	sc = fp->f_data;

	kif->kf_type = KF_TYPE_INOTIFY;
	kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
	kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
	return (0);
}
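
/*
 * Allocate state for a new inotify descriptor and attach it to "fp",
 * charging the descriptor against the caller's per-user instance limit.
 */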
int
inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
{
	struct inotify_softc *sc;
	int fflags;

	if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
		return (EINVAL);

	if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
	    inotify_max_user_instances))
		return (EMFILE);

	sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
	sc->nextwatch = 1; /* Required for compatibility. */
	STAILQ_INIT(&sc->pending);
	RB_INIT(&sc->watches);
	TAILQ_INIT(&sc->deadwatches);
	TASK_INIT(&sc->reaptask, 0, inotify_reap, sc);
	mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
	knlist_init_mtx(&sc->sel.si_note, &sc->lock);
	sc->cred = crhold(td->td_ucred);
	sc->ino = atomic_fetchadd_64(&inotify_ino, 1);

	fflags = FREAD;
	if ((flags & IN_NONBLOCK) != 0)
		fflags |= FNONBLOCK;
	if ((flags & IN_CLOEXEC) != 0)
		*fflagsp |= O_CLOEXEC;
	finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);

	return (0);
}

static struct inotify_record *
inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
    uint32_t cookie, int waitok)
{
	struct inotify_event *evp;
	struct inotify_record *rec;

	rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
	    waitok | M_ZERO);
	if (rec == NULL)
		return (NULL);
	evp = &rec->ev;
	evp->wd = wd;
	evp->mask = event;
	evp->cookie = cookie;
	evp->len = _IN_NAMESIZE(namelen);
	if (name != NULL)
		memcpy(evp->name, name, namelen);
	return (rec);
}

static bool
inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp)
{
	struct inotify_record *prev;

	mtx_assert(&sc->lock, MA_OWNED);

	prev = STAILQ_LAST(&sc->pending, inotify_record, link);
	return (prev != NULL && prev->ev.mask == evp->mask &&
	    prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie &&
	    prev->ev.len == evp->len &&
	    memcmp(prev->ev.name, evp->name, evp->len) == 0);
}

static void
inotify_overflow_event(struct inotify_event *evp)
{
	evp->mask = IN_Q_OVERFLOW;
	evp->wd = -1;
	evp->cookie = 0;
	evp->len = 0;
}

/*
 * Put an event record on the queue for an inotify descriptor.  Return false
 * if the record was not enqueued for some reason, true otherwise.
 */
static bool
inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
{
	struct inotify_event *evp;

	mtx_assert(&sc->lock, MA_OWNED);

	evp = &rec->ev;
	if (__predict_false(rec == &sc->overflow)) {
		/*
		 * Is the overflow record already in the queue?  If so, there's
		 * not much else we can do: we're here because a kernel memory
		 * shortage prevented new record allocations.
		 */
		counter_u64_add(inotify_event_drops, 1);
		if (evp->mask == IN_Q_OVERFLOW)
			return (false);
		inotify_overflow_event(evp);
	} else {
		/* Try to coalesce duplicate events. */
		if (inotify_coalesce && inotify_can_coalesce(sc, evp))
			return (false);

		/*
		 * Would this one overflow the queue?  If so, convert it to an
		 * overflow event and try again to coalesce.
		 */
		if (sc->npending >= inotify_max_queued_events) {
			counter_u64_add(inotify_event_drops, 1);
			inotify_overflow_event(evp);
			if (inotify_can_coalesce(sc, evp))
				return (false);
		}
	}
	inotify_enqueue(sc, rec, false);
	selwakeup(&sc->sel);
	KNOTE_LOCKED(&sc->sel.si_note, 0);
	wakeup(&sc->pending);
	return (true);
}
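
/*
 * Deliver an event to a single watch, falling back to the softc's
 * preallocated overflow record if no memory is available.  One-shot watches,
 * and watches whose vnode is being deleted or unmounted, are also torn down
 * here, with the vrele() deferred to the reap task.
 */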
static void
inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
    int event, uint32_t cookie)
{
	struct inotify_watch key;
	struct inotify_softc *sc;
	struct inotify_record *rec;
	bool allocfail;

	mtx_assert(&watch->vp->v_pollinfo->vpi_lock, MA_OWNED);

	sc = watch->sc;
	rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
	    M_NOWAIT);
	if (rec == NULL) {
		rec = &sc->overflow;
		allocfail = true;
	} else {
		allocfail = false;
	}

	mtx_lock(&sc->lock);
	if (!inotify_queue_record(sc, rec) && rec != &sc->overflow)
		free(rec, M_INOTIFY);
	if ((watch->mask & IN_ONESHOT) != 0 ||
	    (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
		if (!allocfail) {
			rec = inotify_alloc_record(watch->wd, NULL, 0,
			    IN_IGNORED, 0, M_NOWAIT);
			if (rec == NULL)
				rec = &sc->overflow;
			if (!inotify_queue_record(sc, rec) &&
			    rec != &sc->overflow)
				free(rec, M_INOTIFY);
		}

		/*
		 * Remove the watch, taking care to handle races with
		 * inotify_close().  The thread that removes the watch is
		 * responsible for freeing it.
		 */
		key.wd = watch->wd;
		if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
			RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
			inotify_unlink_watch_locked(sc, watch);

			/*
			 * Defer the vrele() to a sleepable thread context.
			 */
			TAILQ_INSERT_TAIL(&sc->deadwatches, watch, vlink);
			taskqueue_enqueue(taskqueue_thread, &sc->reaptask);
		}
	}
	mtx_unlock(&sc->lock);
}

void
inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
    uint32_t cookie)
{
	struct inotify_watch *watch, *tmp;

	KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
	    ("inotify_log: invalid event %#x", event));

	mtx_lock(&vp->v_pollinfo->vpi_lock);
	TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
		KASSERT(watch->vp == vp,
		    ("inotify_log: watch %p vp != vp", watch));
		if ((watch->mask & event) != 0 || event == IN_UNMOUNT)
			inotify_log_one(watch, name, namelen, event, cookie);
	}
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
}

/*
 * An inotify event occurred on a watched vnode.
 */
void
vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
    int event, uint32_t cookie)
{
	int isdir;

	VNPASS(vp->v_holdcnt > 0, vp);

	isdir = vp->v_type == VDIR ? IN_ISDIR : 0;

	if (dvp != NULL) {
		VNPASS(dvp->v_holdcnt > 0, dvp);

		/*
		 * Should we log an event for the vnode itself?
		 */
		if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
			int selfevent;

			switch (event) {
			case _IN_MOVE_DELETE:
			case IN_DELETE:
				/*
				 * IN_DELETE_SELF is only generated when the
				 * last hard link of a file is removed.
				 */
				selfevent = IN_DELETE_SELF;
				if (vp->v_type != VDIR) {
					struct vattr va;
					int error;

					error = VOP_GETATTR(vp, &va,
					    cnp->cn_cred);
					if (error == 0 && va.va_nlink != 0)
						selfevent = 0;
				}
				break;
			case IN_MOVED_FROM:
				cookie = 0;
				selfevent = IN_MOVE_SELF;
				break;
			case _IN_ATTRIB_LINKCOUNT:
				selfevent = IN_ATTRIB;
				break;
			default:
				selfevent = event;
				break;
			}

			if ((selfevent & ~_IN_DIR_EVENTS) != 0) {
				inotify_log(vp, NULL, 0, selfevent | isdir,
				    cookie);
			}
		}

		/*
		 * Something is watching the directory through which this vnode
		 * was referenced, so we may need to log the event.
		 */
		if ((event & IN_ALL_EVENTS) != 0 &&
		    (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
			inotify_log(dvp, cnp->cn_nameptr,
			    cnp->cn_namelen, event | isdir, cookie);
		}
	} else {
		/*
		 * We don't know which watched directory might contain the
		 * vnode, so we have to fall back to searching the name cache.
		 */
		cache_vop_inotify(vp, event, cookie);
	}
}
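
/*
 * Register a watch on a vnode on behalf of an inotify descriptor.  Returns
 * EJUSTRETURN if an existing watch was updated rather than a new one created.
 */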
int
vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
    uint32_t *wdp, struct thread *td)
{
	struct inotify_watch *watch, *watch1;
	uint32_t wd;

	/*
	 * If this is a directory, make sure all of its entries are present in
	 * the name cache so that we're able to look them up if an event
	 * occurs.  The persistent reference on the directory prevents the
	 * outgoing name cache entries from being reclaimed.
	 */
	if (vp->v_type == VDIR) {
		struct dirent *dp;
		char *buf;
		off_t off;
		size_t buflen, len;
		int eof, error;

		buflen = 128 * sizeof(struct dirent);
		buf = malloc(buflen, M_TEMP, M_WAITOK);

		error = 0;
		len = off = eof = 0;
		for (;;) {
			struct nameidata nd;

			error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
			    &len, &off, &eof);
			if (error != 0)
				break;
			if (len == 0)
				/* Finished reading. */
				break;
			if (strcmp(dp->d_name, ".") == 0 ||
			    strcmp(dp->d_name, "..") == 0)
				continue;

			/*
			 * namei() consumes a reference on the starting
			 * directory if it's specified as a vnode.
			 */
			vrefact(vp);
			VOP_UNLOCK(vp);
			NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
			    dp->d_name, vp);
			error = namei(&nd);
			vn_lock(vp, LK_SHARED | LK_RETRY);
			if (error != 0)
				break;
			NDFREE_PNBUF(&nd);
			vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
			vrele(nd.ni_vp);
		}
		free(buf, M_TEMP);
		if (error != 0)
			return (error);
	}

	/*
	 * The vnode referenced in kern_inotify_add_watch() might be different
	 * from this one if nullfs is in the picture.
	 */
	vrefact(vp);
	watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
	watch->sc = sc;
	watch->vp = vp;
	watch->mask = mask;

	/*
	 * Are we updating an existing watch?  Search the vnode's list rather
	 * than that of the softc, as the former is likely to be shorter.
	 */
	v_addpollinfo(vp);
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
		if (watch1->sc == sc)
			break;
	}
	mtx_lock(&sc->lock);
	if (watch1 != NULL) {
		mtx_unlock(&vp->v_pollinfo->vpi_lock);

		/*
		 * We found an existing watch, so update it based on our flags.
		 */
		if ((mask & IN_MASK_CREATE) != 0) {
			mtx_unlock(&sc->lock);
			vrele(vp);
			free(watch, M_INOTIFY);
			return (EEXIST);
		}
		if ((mask & IN_MASK_ADD) != 0)
			watch1->mask |= mask;
		else
			watch1->mask = mask;
		*wdp = watch1->wd;
		mtx_unlock(&sc->lock);
		vrele(vp);
		free(watch, M_INOTIFY);
		return (EJUSTRETURN);
	}

	/*
	 * We're creating a new watch.  Add it to the softc and vnode watch
	 * lists.
	 */
	do {
		struct inotify_watch key;

		/*
		 * Search for the next available watch descriptor.  This is
		 * implemented so as to avoid reusing watch descriptors for as
		 * long as possible.
		 */
		key.wd = wd = sc->nextwatch++;
		watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
	} while (watch1 != NULL || wd == 0);
	watch->wd = wd;
	RB_INSERT(inotify_watch_tree, &sc->watches, watch);
	TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
	mtx_unlock(&sc->lock);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	vn_irflag_set_cond(vp, VIRF_INOTIFY);

	*wdp = wd;

	return (0);
}

void
vn_inotify_revoke(struct vnode *vp)
{
	if (vp->v_pollinfo == NULL) {
		/* This is a nullfs vnode which shadows a watched vnode. */
		return;
	}
	inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
}
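
/*
 * Resolve a file descriptor to an inotify descriptor, verifying its type.
 */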
static int
fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
    struct file **fpp)
{
	struct file *fp;
	int error;

	error = fget(td, fd, needrightsp, &fp);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_INOTIFY) {
		fdrop(fp, td);
		return (EINVAL);
	}
	*fpp = fp;
	return (0);
}

int
kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
    struct thread *td)
{
	struct nameidata nd;
	struct file *fp;
	struct inotify_softc *sc;
	struct vnode *vp;
	uint32_t wd;
	int count, error;

	fp = NULL;
	vp = NULL;

	if ((mask & IN_ALL_EVENTS) == 0)
		return (EXTERROR(EINVAL, "no events specified"));
	if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
	    (IN_MASK_ADD | IN_MASK_CREATE))
		return (EXTERROR(EINVAL,
		    "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"));
	if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
		return (EXTERROR(EINVAL, "unrecognized flag"));

	error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
	if (error != 0)
		return (error);
	sc = fp->f_data;

	NDINIT_AT(&nd, LOOKUP,
	    ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
	    LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
	error = namei(&nd);
	if (error != 0)
		goto out;
	NDFREE_PNBUF(&nd);
	vp = nd.ni_vp;

	error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
	if (error != 0)
		goto out;

	if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	count = atomic_fetchadd_int(&inotify_watches, 1);
	if (count > inotify_max_watches) {
		atomic_subtract_int(&inotify_watches, 1);
		error = ENOSPC;
		goto out;
	}
	if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
	    inotify_max_user_watches)) {
		atomic_subtract_int(&inotify_watches, 1);
		error = ENOSPC;
		goto out;
	}
	error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
	if (error != 0) {
		atomic_subtract_int(&inotify_watches, 1);
		(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
		if (error == EJUSTRETURN) {
			/* We updated an existing watch, everything is ok. */
			error = 0;
		} else {
			goto out;
		}
	}
	td->td_retval[0] = wd;

out:
	if (vp != NULL)
		vput(vp);
	fdrop(fp, td);
	return (error);
}

int
sys_inotify_add_watch_at(struct thread *td,
    struct inotify_add_watch_at_args *uap)
{
	return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
	    uap->mask, td));
}
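
/*
 * Remove a watch from an inotify descriptor and queue an IN_IGNORED event for
 * it.
 */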
int
kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
{
	struct file *fp;
	struct inotify_softc *sc;
	struct inotify_record *rec;
	struct inotify_watch key, *watch;
	int error;

	error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
	if (error != 0)
		return (error);
	sc = fp->f_data;

	rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);

	/*
	 * For compatibility with Linux, we do not remove pending events
	 * associated with the watch.  Watch descriptors are implemented so as
	 * to avoid being reused for as long as possible, so one hopes that any
	 * pending events from the removed watch descriptor will be read
	 * before the watch descriptor is recycled.
	 */
	key.wd = wd;
	mtx_lock(&sc->lock);
	watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
	if (watch == NULL) {
		free(rec, M_INOTIFY);
		error = EINVAL;
	} else {
		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
		if (!inotify_queue_record(sc, rec)) {
			free(rec, M_INOTIFY);
			error = 0;
		}
	}
	mtx_unlock(&sc->lock);
	if (watch != NULL)
		inotify_remove_watch(watch);
	fdrop(fp, td);
	return (error);
}

int
sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
{
	return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
}