/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2025 Klara, Inc.
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/caprights.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#define EXTERR_CATEGORY EXTERR_CAT_INOTIFY
#include <sys/exterrvar.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/inotify.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/ktrace.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslimits.h>
#include <sys/sysproto.h>
#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <sys/user.h>
#include <sys/vnode.h>

uint32_t inotify_rename_cookie;

static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "inotify configuration");

static int inotify_max_queued_events = 16384;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
    &inotify_max_queued_events, 0,
    "Maximum number of events to queue on an inotify descriptor");

static int inotify_max_user_instances = 256;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
    &inotify_max_user_instances, 0,
    "Maximum number of inotify descriptors per user");

static int inotify_max_user_watches;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
    &inotify_max_user_watches, 0,
    "Maximum number of inotify watches per user");

static int inotify_max_watches;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
    &inotify_max_watches, 0,
    "Maximum number of inotify watches system-wide");

static int inotify_watches;
SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
    &inotify_watches, 0,
    "Total number of inotify watches currently in use");

static int inotify_coalesce = 1;
SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
    &inotify_coalesce, 0,
    "Coalesce inotify events when possible");

static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
    &inotify_event_drops,
    "Number of inotify events dropped due to limits or allocation failures");

static fo_rdwr_t	inotify_read;
static fo_ioctl_t	inotify_ioctl;
static fo_poll_t	inotify_poll;
static fo_kqfilter_t	inotify_kqfilter;
static fo_stat_t	inotify_stat;
static fo_close_t	inotify_close;
static fo_fill_kinfo_t	inotify_fill_kinfo;

static const struct fileops inotifyfdops = {
	.fo_read = inotify_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = inotify_ioctl,
	.fo_poll = inotify_poll,
	.fo_kqfilter = inotify_kqfilter,
	.fo_stat = inotify_stat,
	.fo_close = inotify_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = inotify_fill_kinfo,
	.fo_cmp = file_kcmp_generic,
	.fo_flags = DFLAG_PASSABLE,
};

static void	filt_inotifydetach(struct knote *kn);
static int	filt_inotifyevent(struct knote *kn, long hint);

static const struct filterops inotify_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_inotifydetach,
	.f_event = filt_inotifyevent,
	.f_copy = knote_triv_copy,
};

static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");

struct inotify_record {
	STAILQ_ENTRY(inotify_record) link;
	struct inotify_event	ev;
};

static uint64_t inotify_ino = 1;

/*
 * On LP64 systems this occupies 64 bytes, so we don't get internal
 * fragmentation by allocating watches with malloc(9).  If the size changes,
 * consider using a UMA zone to improve memory efficiency.
 */
struct inotify_watch {
	struct inotify_softc *sc;	/* back-pointer */
	int		wd;		/* unique ID */
	uint32_t	mask;		/* event mask */
	struct vnode	*vp;		/* vnode being watched, refed */
	RB_ENTRY(inotify_watch) ilink;	/* inotify linkage */
	TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */
};

static void
inotify_init(void *arg __unused)
{
	/* Don't let a user hold too many vnodes. */
	inotify_max_user_watches = desiredvnodes / 3;
	/* Don't let the system hold too many vnodes. */
	inotify_max_watches = desiredvnodes / 2;
}
SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);

static int
inotify_watch_cmp(const struct inotify_watch *a,
    const struct inotify_watch *b)
{
	if (a->wd < b->wd)
		return (-1);
	else if (a->wd > b->wd)
		return (1);
	else
		return (0);
}
RB_HEAD(inotify_watch_tree, inotify_watch);
RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);

struct inotify_softc {
	struct mtx	lock;			/* serialize all softc writes */
	STAILQ_HEAD(, inotify_record) pending;	/* events waiting to be read */
	struct inotify_record overflow;		/* preallocated record */
	int		nextwatch;		/* next watch ID to try */
	int		npending;		/* number of pending events */
	size_t		nbpending;		/* bytes available to read */
	uint64_t	ino;			/* unique identifier */
	struct inotify_watch_tree watches;	/* active watches */
	TAILQ_HEAD(, inotify_watch) deadwatches; /* watches pending vrele() */
	struct task	reaptask;		/* task to reap dead watches */
	struct selinfo	sel;			/* select/poll/kevent info */
	struct ucred	*cred;			/* credential ref */
};

static struct inotify_record *
inotify_dequeue(struct inotify_softc *sc)
{
	struct inotify_record *rec;

	mtx_assert(&sc->lock, MA_OWNED);
	KASSERT(!STAILQ_EMPTY(&sc->pending),
	    ("%s: queue for %p is empty", __func__, sc));

	rec = STAILQ_FIRST(&sc->pending);
	STAILQ_REMOVE_HEAD(&sc->pending, link);
	sc->npending--;
	sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
	return (rec);
}

static void
inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
{
	mtx_assert(&sc->lock, MA_OWNED);

	if (head)
		STAILQ_INSERT_HEAD(&sc->pending, rec, link);
	else
		STAILQ_INSERT_TAIL(&sc->pending, rec, link);
	sc->npending++;
	sc->nbpending += sizeof(rec->ev) + rec->ev.len;
}
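
/*
 * Copy pending event records out to userspace, sleeping if the queue is empty
 * and the descriptor was not opened with IN_NONBLOCK.  Records are copied out
 * whole; if even the first record does not fit in the buffer, the read fails
 * with EINVAL.
 */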
static int
inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
    struct thread *td)
{
	struct inotify_softc *sc;
	struct inotify_record *rec;
	int error;
	bool first;

	sc = fp->f_data;
	error = 0;

	mtx_lock(&sc->lock);
	while (STAILQ_EMPTY(&sc->pending)) {
		if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&sc->lock);
			return (EWOULDBLOCK);
		}
		error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
		if (error != 0) {
			mtx_unlock(&sc->lock);
			return (error);
		}
	}
	for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
		size_t len;

		rec = inotify_dequeue(sc);
		len = sizeof(rec->ev) + rec->ev.len;
		if (uio->uio_resid < (ssize_t)len) {
			inotify_enqueue(sc, rec, true);
			if (first) {
				error = EXTERROR(EINVAL,
				    "read buffer is too small");
			}
			break;
		}
		mtx_unlock(&sc->lock);
		error = uiomove(&rec->ev, len, uio);
#ifdef KTRACE
		if (error == 0 && KTRPOINT(td, KTR_STRUCT))
			ktrstruct("inotify", &rec->ev, len);
#endif
		mtx_lock(&sc->lock);
		if (error != 0) {
			inotify_enqueue(sc, rec, true);
			mtx_unlock(&sc->lock);
			return (error);
		}
		if (rec == &sc->overflow) {
			/*
			 * Signal to inotify_queue_record() that the overflow
			 * record can be reused.
			 */
			memset(rec, 0, sizeof(*rec));
		} else {
			free(rec, M_INOTIFY);
		}
	}
	mtx_unlock(&sc->lock);
	return (error);
}

static int
inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
    struct thread *td)
{
	struct inotify_softc *sc;

	sc = fp->f_data;

	switch (com) {
	case FIONREAD:
		*(int *)data = (int)sc->nbpending;
		return (0);
	case FIONBIO:
	case FIOASYNC:
		return (0);
	default:
		return (ENOTTY);
	}

	return (0);
}

static int
inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
{
	struct inotify_softc *sc;
	int revents;

	sc = fp->f_data;
	revents = 0;

	mtx_lock(&sc->lock);
	if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
		revents |= events & (POLLIN | POLLRDNORM);
	else
		selrecord(td, &sc->sel);
	mtx_unlock(&sc->lock);
	return (revents);
}

static void
filt_inotifydetach(struct knote *kn)
{
	struct inotify_softc *sc;

	sc = kn->kn_hook;
	knlist_remove(&sc->sel.si_note, kn, 0);
}

static int
filt_inotifyevent(struct knote *kn, long hint)
{
	struct inotify_softc *sc;

	sc = kn->kn_hook;
	mtx_assert(&sc->lock, MA_OWNED);
	kn->kn_data = sc->nbpending;
	return (kn->kn_data > 0);
}

static int
inotify_kqfilter(struct file *fp, struct knote *kn)
{
	struct inotify_softc *sc;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);
	sc = fp->f_data;
	kn->kn_fop = &inotify_rfiltops;
	kn->kn_hook = sc;
	knlist_add(&sc->sel.si_note, kn, 0);
	return (0);
}

static int
inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
{
	struct inotify_softc *sc;

	sc = fp->f_data;

	memset(sb, 0, sizeof(*sb));
	sb->st_mode = S_IFREG | S_IRUSR;
	sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
	mtx_lock(&sc->lock);
	sb->st_size = sc->nbpending;
	sb->st_blocks = sc->npending;
	sb->st_uid = sc->cred->cr_ruid;
	sb->st_gid = sc->cred->cr_rgid;
	sb->st_ino = sc->ino;
	mtx_unlock(&sc->lock);
	return (0);
}

static void
inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch)
{
	struct vnode *vp;

	vp = watch->vp;
	mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);

	atomic_subtract_int(&inotify_watches, 1);
	(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);

	TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
	if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
		vn_irflag_unset(vp, VIRF_INOTIFY);
}

static void
inotify_free_watch(struct inotify_watch *watch)
{
	vrele(watch->vp);
	free(watch, M_INOTIFY);
}

/*
 * Assumes that the watch has already been removed from its softc.
 */
static void
inotify_remove_watch(struct inotify_watch *watch)
{
	struct inotify_softc *sc;
	struct vnode *vp;

	sc = watch->sc;

	vp = watch->vp;
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	inotify_unlink_watch_locked(sc, watch);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	inotify_free_watch(watch);
}

static void
inotify_reap(void *arg, int pending)
{
	struct inotify_softc *sc;
	struct inotify_watch *watch;

	sc = arg;
	mtx_lock(&sc->lock);
	while ((watch = TAILQ_FIRST(&sc->deadwatches)) != NULL) {
		TAILQ_REMOVE(&sc->deadwatches, watch, vlink);
		mtx_unlock(&sc->lock);
		inotify_free_watch(watch);
		mtx_lock(&sc->lock);
	}
	mtx_unlock(&sc->lock);
}
static int
inotify_close(struct file *fp, struct thread *td)
{
	struct inotify_softc *sc;
	struct inotify_record *rec;
	struct inotify_watch *watch;

	sc = fp->f_data;

	/* Detach watches from their vnodes. */
	mtx_lock(&sc->lock);
	(void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
	while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
		mtx_unlock(&sc->lock);
		inotify_remove_watch(watch);
		mtx_lock(&sc->lock);
	}

	/* Make sure that any asynchronous vrele() calls are done. */
	mtx_unlock(&sc->lock);
	taskqueue_drain(taskqueue_thread, &sc->reaptask);
	mtx_lock(&sc->lock);
	KASSERT(RB_EMPTY(&sc->watches),
	    ("%s: watches not empty in %p", __func__, sc));
	KASSERT(TAILQ_EMPTY(&sc->deadwatches),
	    ("%s: deadwatches not empty in %p", __func__, sc));

	/* Drop pending events. */
	while (!STAILQ_EMPTY(&sc->pending)) {
		rec = inotify_dequeue(sc);
		if (rec != &sc->overflow)
			free(rec, M_INOTIFY);
	}
	mtx_unlock(&sc->lock);
	seldrain(&sc->sel);
	knlist_destroy(&sc->sel.si_note);
	mtx_destroy(&sc->lock);
	crfree(sc->cred);
	free(sc, M_INOTIFY);
	return (0);
}

static int
inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
    struct filedesc *fdp)
{
	struct inotify_softc *sc;

	sc = fp->f_data;

	kif->kf_type = KF_TYPE_INOTIFY;
	kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
	kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
	return (0);
}

/*
 * Set up the state backing a new inotify descriptor, charging it against the
 * per-user instance limit.
 */
int
inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
{
	struct inotify_softc *sc;
	int fflags;

	if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
		return (EINVAL);

	if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
	    inotify_max_user_instances))
		return (EMFILE);

	sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
	sc->nextwatch = 1; /* Required for compatibility. */
	STAILQ_INIT(&sc->pending);
	RB_INIT(&sc->watches);
	TAILQ_INIT(&sc->deadwatches);
	TASK_INIT(&sc->reaptask, 0, inotify_reap, sc);
	mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
	knlist_init_mtx(&sc->sel.si_note, &sc->lock);
	sc->cred = crhold(td->td_ucred);
	sc->ino = atomic_fetchadd_64(&inotify_ino, 1);

	fflags = FREAD;
	if ((flags & IN_NONBLOCK) != 0)
		fflags |= FNONBLOCK;
	if ((flags & IN_CLOEXEC) != 0)
		*fflagsp |= O_CLOEXEC;
	finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);

	return (0);
}

static struct inotify_record *
inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
    uint32_t cookie, int waitok)
{
	struct inotify_event *evp;
	struct inotify_record *rec;

	rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
	    waitok | M_ZERO);
	if (rec == NULL)
		return (NULL);
	evp = &rec->ev;
	evp->wd = wd;
	evp->mask = event;
	evp->cookie = cookie;
	evp->len = _IN_NAMESIZE(namelen);
	if (name != NULL)
		memcpy(evp->name, name, namelen);
	return (rec);
}

static bool
inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp)
{
	struct inotify_record *prev;

	mtx_assert(&sc->lock, MA_OWNED);

	prev = STAILQ_LAST(&sc->pending, inotify_record, link);
	return (prev != NULL && prev->ev.mask == evp->mask &&
	    prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie &&
	    prev->ev.len == evp->len &&
	    memcmp(prev->ev.name, evp->name, evp->len) == 0);
}

static void
inotify_overflow_event(struct inotify_event *evp)
{
	evp->mask = IN_Q_OVERFLOW;
	evp->wd = -1;
	evp->cookie = 0;
	evp->len = 0;
}

/*
 * Put an event record on the queue for an inotify descriptor.  Return false if
 * the record was not enqueued for some reason, true otherwise.
 */
static bool
inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
{
	struct inotify_event *evp;

	mtx_assert(&sc->lock, MA_OWNED);

	evp = &rec->ev;
	if (__predict_false(rec == &sc->overflow)) {
		/*
		 * Is the overflow record already in the queue?  If so, there's
		 * not much else we can do: we're here because a kernel memory
		 * shortage prevented new record allocations.
		 */
		counter_u64_add(inotify_event_drops, 1);
		if (evp->mask == IN_Q_OVERFLOW)
			return (false);
		inotify_overflow_event(evp);
	} else {
		/* Try to coalesce duplicate events. */
		if (inotify_coalesce && inotify_can_coalesce(sc, evp))
			return (false);

		/*
		 * Would this one overflow the queue?  If so, convert it to an
		 * overflow event and try again to coalesce.
		 */
		if (sc->npending >= inotify_max_queued_events) {
			counter_u64_add(inotify_event_drops, 1);
			inotify_overflow_event(evp);
			if (inotify_can_coalesce(sc, evp))
				return (false);
		}
	}
	inotify_enqueue(sc, rec, false);
	selwakeup(&sc->sel);
	KNOTE_LOCKED(&sc->sel.si_note, 0);
	wakeup(&sc->pending);
	return (true);
}

/*
 * Record an event for a single watch.  One-shot watches, and watches whose
 * vnode is being deleted or unmounted, also receive an IN_IGNORED event and
 * are then removed.
 */
static void
inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
    int event, uint32_t cookie)
{
	struct inotify_watch key;
	struct inotify_softc *sc;
	struct inotify_record *rec;
	bool allocfail;

	mtx_assert(&watch->vp->v_pollinfo->vpi_lock, MA_OWNED);

	sc = watch->sc;
	rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
	    M_NOWAIT);
	if (rec == NULL) {
		rec = &sc->overflow;
		allocfail = true;
	} else {
		allocfail = false;
	}

	mtx_lock(&sc->lock);
	if (!inotify_queue_record(sc, rec) && rec != &sc->overflow)
		free(rec, M_INOTIFY);
	if ((watch->mask & IN_ONESHOT) != 0 ||
	    (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
		if (!allocfail) {
			rec = inotify_alloc_record(watch->wd, NULL, 0,
			    IN_IGNORED, 0, M_NOWAIT);
			if (rec == NULL)
				rec = &sc->overflow;
			if (!inotify_queue_record(sc, rec) &&
			    rec != &sc->overflow)
				free(rec, M_INOTIFY);
		}

		/*
		 * Remove the watch, taking care to handle races with
		 * inotify_close().  The thread that removes the watch is
		 * responsible for freeing it.
		 */
		key.wd = watch->wd;
		if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
			RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
			inotify_unlink_watch_locked(sc, watch);

			/*
			 * Defer the vrele() to a sleepable thread context.
			 */
			TAILQ_INSERT_TAIL(&sc->deadwatches, watch, vlink);
			taskqueue_enqueue(taskqueue_thread, &sc->reaptask);
		}
	}
	mtx_unlock(&sc->lock);
}

void
inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
    uint32_t cookie)
{
	struct inotify_watch *watch, *tmp;

	KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
	    ("inotify_log: invalid event %#x", event));

	mtx_lock(&vp->v_pollinfo->vpi_lock);
	TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
		KASSERT(watch->vp == vp,
		    ("inotify_log: watch %p vp != vp", watch));
		if ((watch->mask & event) != 0 || event == IN_UNMOUNT)
			inotify_log_one(watch, name, namelen, event, cookie);
	}
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
}

/*
 * An inotify event occurred on a watched vnode.
 */
void
vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
    int event, uint32_t cookie)
{
	int isdir;

	VNPASS(vp->v_holdcnt > 0, vp);

	isdir = vp->v_type == VDIR ? IN_ISDIR : 0;

	if (dvp != NULL) {
		VNPASS(dvp->v_holdcnt > 0, dvp);

		/*
		 * Should we log an event for the vnode itself?
		 */
		if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
			int selfevent;

			switch (event) {
			case _IN_MOVE_DELETE:
			case IN_DELETE:
				/*
				 * IN_DELETE_SELF is only generated when the
				 * last hard link of a file is removed.
				 */
				selfevent = IN_DELETE_SELF;
				if (vp->v_type != VDIR) {
					struct vattr va;
					int error;

					error = VOP_GETATTR(vp, &va,
					    cnp->cn_cred);
					if (error == 0 && va.va_nlink != 0)
						selfevent = 0;
				}
				break;
			case IN_MOVED_FROM:
				cookie = 0;
				selfevent = IN_MOVE_SELF;
				break;
			case _IN_ATTRIB_LINKCOUNT:
				selfevent = IN_ATTRIB;
				break;
			default:
				selfevent = event;
				break;
			}

			if ((selfevent & ~_IN_DIR_EVENTS) != 0) {
				inotify_log(vp, NULL, 0, selfevent | isdir,
				    cookie);
			}
		}

		/*
		 * Something is watching the directory through which this vnode
		 * was referenced, so we may need to log the event.
		 */
		if ((event & IN_ALL_EVENTS) != 0 &&
		    (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
			inotify_log(dvp, cnp->cn_nameptr,
			    cnp->cn_namelen, event | isdir, cookie);
		}
	} else {
		/*
		 * We don't know which watched directory might contain the
		 * vnode, so we have to fall back to searching the name cache.
		 */
		cache_vop_inotify(vp, event, cookie);
	}
}

/*
 * Add a watch on a vnode on behalf of an inotify descriptor, or update an
 * existing watch, in which case EJUSTRETURN is returned.
 */
int
vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
    uint32_t *wdp, struct thread *td)
{
	struct inotify_watch *watch, *watch1;
	uint32_t wd;

	/*
	 * If this is a directory, make sure all of its entries are present in
	 * the name cache so that we're able to look them up if an event occurs.
	 * The persistent reference on the directory prevents the outgoing name
	 * cache entries from being reclaimed.
	 */
	if (vp->v_type == VDIR) {
		struct dirent *dp;
		char *buf;
		off_t off;
		size_t buflen, len;
		int eof, error;

		buflen = 128 * sizeof(struct dirent);
		buf = malloc(buflen, M_TEMP, M_WAITOK);

		error = 0;
		len = off = eof = 0;
		for (;;) {
			struct nameidata nd;

			error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
			    &len, &off, &eof);
			if (error != 0)
				break;
			if (len == 0)
				/* Finished reading. */
				break;
			if (strcmp(dp->d_name, ".") == 0 ||
			    strcmp(dp->d_name, "..") == 0)
				continue;

			/*
			 * namei() consumes a reference on the starting
			 * directory if it's specified as a vnode.
			 */
			vrefact(vp);
			VOP_UNLOCK(vp);
			NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
			    dp->d_name, vp);
			error = namei(&nd);
			vn_lock(vp, LK_SHARED | LK_RETRY);
			if (error != 0)
				break;
			NDFREE_PNBUF(&nd);
			vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
			vrele(nd.ni_vp);
		}
		free(buf, M_TEMP);
		if (error != 0)
			return (error);
	}

	/*
	 * The vnode referenced in kern_inotify_add_watch() might be different
	 * than this one if nullfs is in the picture.
	 */
	vrefact(vp);
	watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
	watch->sc = sc;
	watch->vp = vp;
	watch->mask = mask;

	/*
	 * Are we updating an existing watch?  Search the vnode's list rather
	 * than that of the softc, as the former is likely to be shorter.
	 */
	v_addpollinfo(vp);
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
		if (watch1->sc == sc)
			break;
	}
	mtx_lock(&sc->lock);
	if (watch1 != NULL) {
		mtx_unlock(&vp->v_pollinfo->vpi_lock);

		/*
		 * We found an existing watch, update it based on our flags.
		 */
		if ((mask & IN_MASK_CREATE) != 0) {
			mtx_unlock(&sc->lock);
			vrele(vp);
			free(watch, M_INOTIFY);
			return (EEXIST);
		}
		if ((mask & IN_MASK_ADD) != 0)
			watch1->mask |= mask;
		else
			watch1->mask = mask;
		*wdp = watch1->wd;
		mtx_unlock(&sc->lock);
		vrele(vp);
		free(watch, M_INOTIFY);
		return (EJUSTRETURN);
	}

	/*
	 * We're creating a new watch.  Add it to the softc and vnode watch
	 * lists.
	 */
	do {
		struct inotify_watch key;

		/*
		 * Search for the next available watch descriptor.  This is
		 * implemented so as to avoid reusing watch descriptors for as
		 * long as possible.
		 */
		key.wd = wd = sc->nextwatch++;
		watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
	} while (watch1 != NULL || wd == 0);
	watch->wd = wd;
	RB_INSERT(inotify_watch_tree, &sc->watches, watch);
	TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
	mtx_unlock(&sc->lock);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	vn_irflag_set_cond(vp, VIRF_INOTIFY);

	*wdp = wd;

	return (0);
}

void
vn_inotify_revoke(struct vnode *vp)
{
	if (vp->v_pollinfo == NULL) {
		/* This is a nullfs vnode which shadows a watched vnode. */
		return;
	}
	inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
}

static int
fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
    struct file **fpp)
{
	struct file *fp;
	int error;

	error = fget(td, fd, needrightsp, &fp);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_INOTIFY) {
		fdrop(fp, td);
		return (EINVAL);
	}
	*fpp = fp;
	return (0);
}

/*
 * Validate the event mask, look up the path to be watched and check access to
 * it, and charge the new watch against the global and per-user limits before
 * handing off to VOP_INOTIFY_ADD_WATCH().
 */
int
kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
    struct thread *td)
{
	struct nameidata nd;
	struct file *fp;
	struct inotify_softc *sc;
	struct vnode *vp;
	uint32_t wd;
	int count, error;

	fp = NULL;
	vp = NULL;

	if ((mask & IN_ALL_EVENTS) == 0)
		return (EXTERROR(EINVAL, "no events specified"));
	if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
	    (IN_MASK_ADD | IN_MASK_CREATE))
		return (EXTERROR(EINVAL,
		    "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"));
	if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
		return (EXTERROR(EINVAL, "unrecognized flag"));

	error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
	if (error != 0)
		return (error);
	sc = fp->f_data;

	NDINIT_AT(&nd, LOOKUP,
	    ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
	    LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
	error = namei(&nd);
	if (error != 0)
		goto out;
	NDFREE_PNBUF(&nd);
	vp = nd.ni_vp;

	error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
	if (error != 0)
		goto out;

	if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	count = atomic_fetchadd_int(&inotify_watches, 1);
	if (count > inotify_max_watches) {
		atomic_subtract_int(&inotify_watches, 1);
		error = ENOSPC;
		goto out;
	}
	if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
	    inotify_max_user_watches)) {
		atomic_subtract_int(&inotify_watches, 1);
		error = ENOSPC;
		goto out;
	}
	error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
	if (error != 0) {
		atomic_subtract_int(&inotify_watches, 1);
		(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
		if (error == EJUSTRETURN) {
			/* We updated an existing watch, everything is ok. */
			error = 0;
		} else {
			goto out;
		}
	}
	td->td_retval[0] = wd;

out:
	if (vp != NULL)
		vput(vp);
	fdrop(fp, td);
	return (error);
}

int
sys_inotify_add_watch_at(struct thread *td,
    struct inotify_add_watch_at_args *uap)
{
	return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
	    uap->mask, td));
}
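
/*
 * Remove the watch identified by wd and queue an IN_IGNORED event for it.
 * Events already queued for the watch are left in place.
 */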
int
kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
{
	struct file *fp;
	struct inotify_softc *sc;
	struct inotify_record *rec;
	struct inotify_watch key, *watch;
	int error;

	error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
	if (error != 0)
		return (error);
	sc = fp->f_data;

	rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);

	/*
	 * For compatibility with Linux, we do not remove pending events
	 * associated with the watch.  Watch descriptors are implemented so as
	 * to avoid being reused for as long as possible, so one hopes that any
	 * pending events from the removed watch descriptor will be removed
	 * before the watch descriptor is recycled.
	 */
	key.wd = wd;
	mtx_lock(&sc->lock);
	watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
	if (watch == NULL) {
		free(rec, M_INOTIFY);
		error = EINVAL;
	} else {
		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
		if (!inotify_queue_record(sc, rec)) {
			free(rec, M_INOTIFY);
			error = 0;
		}
	}
	mtx_unlock(&sc->lock);
	if (watch != NULL)
		inotify_remove_watch(watch);
	fdrop(fp, td);
	return (error);
}

int
sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
{
	return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
}