/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2025 Klara, Inc.
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/caprights.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#define	EXTERR_CATEGORY	EXTERR_CAT_INOTIFY
#include <sys/exterrvar.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/inotify.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/ktrace.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslimits.h>
#include <sys/sysproto.h>
#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <sys/user.h>
#include <sys/vnode.h>

uint32_t inotify_rename_cookie;

static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "inotify configuration");

static int inotify_max_queued_events = 16384;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
    &inotify_max_queued_events, 0,
    "Maximum number of events to queue on an inotify descriptor");

static int inotify_max_user_instances = 256;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
    &inotify_max_user_instances, 0,
    "Maximum number of inotify descriptors per user");

static int inotify_max_user_watches;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
    &inotify_max_user_watches, 0,
    "Maximum number of inotify watches per user");

static int inotify_max_watches;
SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
    &inotify_max_watches, 0,
    "Maximum number of inotify watches system-wide");

static int inotify_watches;
SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
    &inotify_watches, 0,
    "Total number of inotify watches currently in use");

static int inotify_coalesce = 1;
SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
    &inotify_coalesce, 0,
    "Coalesce inotify events when possible");

static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
    &inotify_event_drops,
    "Number of inotify events dropped due to limits or allocation failures");

static fo_rdwr_t	inotify_read;
static fo_ioctl_t	inotify_ioctl;
static fo_poll_t	inotify_poll;
static fo_kqfilter_t	inotify_kqfilter;
static fo_stat_t	inotify_stat;
static fo_close_t	inotify_close;
static fo_fill_kinfo_t	inotify_fill_kinfo;

static const struct fileops inotifyfdops = {
	.fo_read = inotify_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = inotify_ioctl,
	.fo_poll = inotify_poll,
	.fo_kqfilter = inotify_kqfilter,
	.fo_stat = inotify_stat,
	.fo_close = inotify_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = inotify_fill_kinfo,
	.fo_cmp = file_kcmp_generic,
	.fo_flags = DFLAG_PASSABLE,
};

static void	filt_inotifydetach(struct knote *kn);
static int	filt_inotifyevent(struct knote *kn, long hint);

static const struct filterops inotify_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_inotifydetach,
	.f_event = filt_inotifyevent,
};
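
/*
 * Each inotify descriptor is represented by an inotify_softc, defined below,
 * which carries a queue of inotify_records (events waiting to be read) and a
 * red-black tree of inotify_watches keyed by watch descriptor.  A watch ties
 * a softc to a watched vnode; the vnode side keeps its own list of watches in
 * its pollinfo so that an event can be delivered to every descriptor watching
 * that vnode.
 */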

static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");

struct inotify_record {
	STAILQ_ENTRY(inotify_record) link;
	struct inotify_event	ev;
};

static uint64_t inotify_ino = 1;

/*
 * On LP64 systems this occupies 64 bytes, so we don't get internal
 * fragmentation by allocating watches with malloc(9).  If the size changes,
 * consider using a UMA zone to improve memory efficiency.
 */
struct inotify_watch {
	struct inotify_softc *sc;	/* back-pointer */
	int		wd;		/* unique ID */
	uint32_t	mask;		/* event mask */
	struct vnode	*vp;		/* vnode being watched, refed */
	RB_ENTRY(inotify_watch) ilink;	/* inotify linkage */
	TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */
};

static void
inotify_init(void *arg __unused)
{
	/* Don't let a user hold too many vnodes. */
	inotify_max_user_watches = desiredvnodes / 3;
	/* Don't let the system hold too many vnodes. */
	inotify_max_watches = desiredvnodes / 2;
}
SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);

static int
inotify_watch_cmp(const struct inotify_watch *a,
    const struct inotify_watch *b)
{
	if (a->wd < b->wd)
		return (-1);
	else if (a->wd > b->wd)
		return (1);
	else
		return (0);
}
RB_HEAD(inotify_watch_tree, inotify_watch);
RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);

struct inotify_softc {
	struct mtx	lock;			/* serialize all softc writes */
	STAILQ_HEAD(, inotify_record) pending;	/* events waiting to be read */
	struct inotify_record overflow;		/* preallocated record */
	int		nextwatch;		/* next watch ID to try */
	int		npending;		/* number of pending events */
	size_t		nbpending;		/* bytes available to read */
	uint64_t	ino;			/* unique identifier */
	struct inotify_watch_tree watches;	/* active watches */
	TAILQ_HEAD(, inotify_watch) deadwatches; /* watches pending vrele() */
	struct task	reaptask;		/* task to reap dead watches */
	struct selinfo	sel;			/* select/poll/kevent info */
	struct ucred	*cred;			/* credential ref */
};

static struct inotify_record *
inotify_dequeue(struct inotify_softc *sc)
{
	struct inotify_record *rec;

	mtx_assert(&sc->lock, MA_OWNED);
	KASSERT(!STAILQ_EMPTY(&sc->pending),
	    ("%s: queue for %p is empty", __func__, sc));

	rec = STAILQ_FIRST(&sc->pending);
	STAILQ_REMOVE_HEAD(&sc->pending, link);
	sc->npending--;
	sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
	return (rec);
}

static void
inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
{
	mtx_assert(&sc->lock, MA_OWNED);

	if (head)
		STAILQ_INSERT_HEAD(&sc->pending, rec, link);
	else
		STAILQ_INSERT_TAIL(&sc->pending, rec, link);
	sc->npending++;
	sc->nbpending += sizeof(rec->ev) + rec->ev.len;
}
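
/*
 * Copy queued events to userspace.  If the queue is empty the caller blocks,
 * unless the descriptor is non-blocking, in which case EWOULDBLOCK is
 * returned.  Records are copied out whole; if the first record does not fit
 * in the supplied buffer, the read fails with EINVAL.
 */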
static int
inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
    struct thread *td)
{
	struct inotify_softc *sc;
	struct inotify_record *rec;
	int error;
	bool first;

	sc = fp->f_data;
	error = 0;

	mtx_lock(&sc->lock);
	while (STAILQ_EMPTY(&sc->pending)) {
		if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&sc->lock);
			return (EWOULDBLOCK);
		}
		error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
		if (error != 0) {
			mtx_unlock(&sc->lock);
			return (error);
		}
	}
	for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
		size_t len;

		rec = inotify_dequeue(sc);
		len = sizeof(rec->ev) + rec->ev.len;
		if (uio->uio_resid < (ssize_t)len) {
			inotify_enqueue(sc, rec, true);
			if (first) {
				error = EXTERROR(EINVAL,
				    "read buffer is too small");
			}
			break;
		}
		mtx_unlock(&sc->lock);
		error = uiomove(&rec->ev, len, uio);
#ifdef KTRACE
		if (error == 0 && KTRPOINT(td, KTR_STRUCT))
			ktrstruct("inotify", &rec->ev, len);
#endif
		mtx_lock(&sc->lock);
		if (error != 0) {
			inotify_enqueue(sc, rec, true);
			mtx_unlock(&sc->lock);
			return (error);
		}
		if (rec == &sc->overflow) {
			/*
			 * Signal to inotify_queue_record() that the overflow
			 * record can be reused.
			 */
			memset(rec, 0, sizeof(*rec));
		} else {
			free(rec, M_INOTIFY);
		}
	}
	mtx_unlock(&sc->lock);
	return (error);
}

static int
inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
    struct thread *td)
{
	struct inotify_softc *sc;

	sc = fp->f_data;

	switch (com) {
	case FIONREAD:
		*(int *)data = (int)sc->nbpending;
		return (0);
	case FIONBIO:
	case FIOASYNC:
		return (0);
	default:
		return (ENOTTY);
	}

	return (0);
}

static int
inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
{
	struct inotify_softc *sc;
	int revents;

	sc = fp->f_data;
	revents = 0;

	mtx_lock(&sc->lock);
	if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
		revents |= events & (POLLIN | POLLRDNORM);
	else
		selrecord(td, &sc->sel);
	mtx_unlock(&sc->lock);
	return (revents);
}

static void
filt_inotifydetach(struct knote *kn)
{
	struct inotify_softc *sc;

	sc = kn->kn_hook;
	knlist_remove(&sc->sel.si_note, kn, 0);
}

static int
filt_inotifyevent(struct knote *kn, long hint)
{
	struct inotify_softc *sc;

	sc = kn->kn_hook;
	mtx_assert(&sc->lock, MA_OWNED);
	kn->kn_data = sc->nbpending;
	return (kn->kn_data > 0);
}

static int
inotify_kqfilter(struct file *fp, struct knote *kn)
{
	struct inotify_softc *sc;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);
	sc = fp->f_data;
	kn->kn_fop = &inotify_rfiltops;
	kn->kn_hook = sc;
	knlist_add(&sc->sel.si_note, kn, 0);
	return (0);
}

static int
inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
{
	struct inotify_softc *sc;

	sc = fp->f_data;

	memset(sb, 0, sizeof(*sb));
	sb->st_mode = S_IFREG | S_IRUSR;
	sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
	mtx_lock(&sc->lock);
	sb->st_size = sc->nbpending;
	sb->st_blocks = sc->npending;
	sb->st_uid = sc->cred->cr_ruid;
	sb->st_gid = sc->cred->cr_rgid;
	sb->st_ino = sc->ino;
	mtx_unlock(&sc->lock);
	return (0);
}
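
/*
 * Detach a watch from its vnode: drop the global and per-user accounting and
 * remove it from the vnode's watch list, clearing VIRF_INOTIFY once that list
 * becomes empty.  The caller must hold the vnode's pollinfo lock and remains
 * responsible for releasing the vnode reference and freeing the watch.
 */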
static void
inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch)
{
	struct vnode *vp;

	vp = watch->vp;
	mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);

	atomic_subtract_int(&inotify_watches, 1);
	(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);

	TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
	if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
		vn_irflag_unset(vp, VIRF_INOTIFY);
}

static void
inotify_free_watch(struct inotify_watch *watch)
{
	vrele(watch->vp);
	free(watch, M_INOTIFY);
}

/*
 * Assumes that the watch has already been removed from its softc.
 */
static void
inotify_remove_watch(struct inotify_watch *watch)
{
	struct inotify_softc *sc;
	struct vnode *vp;

	sc = watch->sc;

	vp = watch->vp;
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	inotify_unlink_watch_locked(sc, watch);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	inotify_free_watch(watch);
}

static void
inotify_reap(void *arg, int pending)
{
	struct inotify_softc *sc;
	struct inotify_watch *watch;

	sc = arg;
	mtx_lock(&sc->lock);
	while ((watch = TAILQ_FIRST(&sc->deadwatches)) != NULL) {
		TAILQ_REMOVE(&sc->deadwatches, watch, vlink);
		mtx_unlock(&sc->lock);
		inotify_free_watch(watch);
		mtx_lock(&sc->lock);
	}
	mtx_unlock(&sc->lock);
}

static int
inotify_close(struct file *fp, struct thread *td)
{
	struct inotify_softc *sc;
	struct inotify_record *rec;
	struct inotify_watch *watch;

	sc = fp->f_data;

	/* Detach watches from their vnodes. */
	mtx_lock(&sc->lock);
	(void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
	while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
		mtx_unlock(&sc->lock);
		inotify_remove_watch(watch);
		mtx_lock(&sc->lock);
	}

	/* Make sure that any asynchronous vrele() calls are done. */
	mtx_unlock(&sc->lock);
	taskqueue_drain(taskqueue_thread, &sc->reaptask);
	mtx_lock(&sc->lock);
	KASSERT(RB_EMPTY(&sc->watches),
	    ("%s: watches not empty in %p", __func__, sc));
	KASSERT(TAILQ_EMPTY(&sc->deadwatches),
	    ("%s: deadwatches not empty in %p", __func__, sc));

	/* Drop pending events. */
	while (!STAILQ_EMPTY(&sc->pending)) {
		rec = inotify_dequeue(sc);
		if (rec != &sc->overflow)
			free(rec, M_INOTIFY);
	}
	mtx_unlock(&sc->lock);
	seldrain(&sc->sel);
	knlist_destroy(&sc->sel.si_note);
	mtx_destroy(&sc->lock);
	crfree(sc->cred);
	free(sc, M_INOTIFY);
	return (0);
}

static int
inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
    struct filedesc *fdp)
{
	struct inotify_softc *sc;

	sc = fp->f_data;

	kif->kf_type = KF_TYPE_INOTIFY;
	kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
	kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
	return (0);
}
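
/*
 * Initialize the softc backing a new inotify descriptor.  The IN_NONBLOCK and
 * IN_CLOEXEC flags are translated into their file-descriptor equivalents, and
 * the per-user limit on inotify descriptors is enforced here.
 */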
int
inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
{
	struct inotify_softc *sc;
	int fflags;

	if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
		return (EINVAL);

	if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
	    inotify_max_user_instances))
		return (EMFILE);

	sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
	sc->nextwatch = 1; /* Required for compatibility. */
	STAILQ_INIT(&sc->pending);
	RB_INIT(&sc->watches);
	TAILQ_INIT(&sc->deadwatches);
	TASK_INIT(&sc->reaptask, 0, inotify_reap, sc);
	mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
	knlist_init_mtx(&sc->sel.si_note, &sc->lock);
	sc->cred = crhold(td->td_ucred);
	sc->ino = atomic_fetchadd_64(&inotify_ino, 1);

	fflags = FREAD;
	if ((flags & IN_NONBLOCK) != 0)
		fflags |= FNONBLOCK;
	if ((flags & IN_CLOEXEC) != 0)
		*fflagsp |= O_CLOEXEC;
	finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);

	return (0);
}

static struct inotify_record *
inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
    uint32_t cookie, int waitok)
{
	struct inotify_event *evp;
	struct inotify_record *rec;

	rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
	    waitok | M_ZERO);
	if (rec == NULL)
		return (NULL);
	evp = &rec->ev;
	evp->wd = wd;
	evp->mask = event;
	evp->cookie = cookie;
	evp->len = _IN_NAMESIZE(namelen);
	if (name != NULL)
		memcpy(evp->name, name, namelen);
	return (rec);
}

static bool
inotify_can_coalesce(struct inotify_softc *sc, struct inotify_event *evp)
{
	struct inotify_record *prev;

	mtx_assert(&sc->lock, MA_OWNED);

	prev = STAILQ_LAST(&sc->pending, inotify_record, link);
	return (prev != NULL && prev->ev.mask == evp->mask &&
	    prev->ev.wd == evp->wd && prev->ev.cookie == evp->cookie &&
	    prev->ev.len == evp->len &&
	    memcmp(prev->ev.name, evp->name, evp->len) == 0);
}

static void
inotify_overflow_event(struct inotify_event *evp)
{
	evp->mask = IN_Q_OVERFLOW;
	evp->wd = -1;
	evp->cookie = 0;
	evp->len = 0;
}

/*
 * Put an event record on the queue for an inotify descriptor.  Return false if
 * the record was not enqueued for some reason, true otherwise.
 */
static bool
inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
{
	struct inotify_event *evp;

	mtx_assert(&sc->lock, MA_OWNED);

	evp = &rec->ev;
	if (__predict_false(rec == &sc->overflow)) {
		/*
		 * Is the overflow record already in the queue?  If so, there's
		 * not much else we can do: we're here because a kernel memory
		 * shortage prevented new record allocations.
		 */
		counter_u64_add(inotify_event_drops, 1);
		if (evp->mask == IN_Q_OVERFLOW)
			return (false);
		inotify_overflow_event(evp);
	} else {
		/* Try to coalesce duplicate events. */
		if (inotify_coalesce && inotify_can_coalesce(sc, evp))
			return (false);

		/*
		 * Would this one overflow the queue?  If so, convert it to an
		 * overflow event and try again to coalesce.
		 */
		if (sc->npending >= inotify_max_queued_events) {
			counter_u64_add(inotify_event_drops, 1);
			inotify_overflow_event(evp);
			if (inotify_can_coalesce(sc, evp))
				return (false);
		}
	}
	inotify_enqueue(sc, rec, false);
	selwakeup(&sc->sel);
	KNOTE_LOCKED(&sc->sel.si_note, 0);
	wakeup(&sc->pending);
	return (true);
}
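
/*
 * Deliver an event to a single watch.  Allocation must not sleep since the
 * vnode's pollinfo lock is held; if it fails, the softc's preallocated
 * overflow record is used instead.  One-shot watches, and watches whose
 * target is going away, additionally queue IN_IGNORED and are scheduled for
 * removal.
 */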
static void
inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
    int event, uint32_t cookie)
{
	struct inotify_watch key;
	struct inotify_softc *sc;
	struct inotify_record *rec;
	bool allocfail;

	mtx_assert(&watch->vp->v_pollinfo->vpi_lock, MA_OWNED);

	sc = watch->sc;
	rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
	    M_NOWAIT);
	if (rec == NULL) {
		rec = &sc->overflow;
		allocfail = true;
	} else {
		allocfail = false;
	}

	mtx_lock(&sc->lock);
	if (!inotify_queue_record(sc, rec) && rec != &sc->overflow)
		free(rec, M_INOTIFY);
	if ((watch->mask & IN_ONESHOT) != 0 ||
	    (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
		if (!allocfail) {
			rec = inotify_alloc_record(watch->wd, NULL, 0,
			    IN_IGNORED, 0, M_NOWAIT);
			if (rec == NULL)
				rec = &sc->overflow;
			if (!inotify_queue_record(sc, rec) &&
			    rec != &sc->overflow)
				free(rec, M_INOTIFY);
		}

		/*
		 * Remove the watch, taking care to handle races with
		 * inotify_close().  The thread that removes the watch is
		 * responsible for freeing it.
		 */
		key.wd = watch->wd;
		if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
			RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
			inotify_unlink_watch_locked(sc, watch);

			/*
			 * Defer the vrele() to a sleepable thread context.
			 */
			TAILQ_INSERT_TAIL(&sc->deadwatches, watch, vlink);
			taskqueue_enqueue(taskqueue_thread, &sc->reaptask);
		}
	}
	mtx_unlock(&sc->lock);
}

void
inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
    uint32_t cookie)
{
	struct inotify_watch *watch, *tmp;

	KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
	    ("inotify_log: invalid event %#x", event));

	mtx_lock(&vp->v_pollinfo->vpi_lock);
	TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
		KASSERT(watch->vp == vp,
		    ("inotify_log: watch %p vp != vp", watch));
		if ((watch->mask & event) != 0 || event == IN_UNMOUNT)
			inotify_log_one(watch, name, namelen, event, cookie);
	}
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
}

/*
 * An inotify event occurred on a watched vnode.
 */
void
vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
    int event, uint32_t cookie)
{
	int isdir;

	VNPASS(vp->v_holdcnt > 0, vp);

	isdir = vp->v_type == VDIR ? IN_ISDIR : 0;

	if (dvp != NULL) {
		VNPASS(dvp->v_holdcnt > 0, dvp);

		/*
		 * Should we log an event for the vnode itself?
		 */
		if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
			int selfevent;

			switch (event) {
			case _IN_MOVE_DELETE:
			case IN_DELETE:
				/*
				 * IN_DELETE_SELF is only generated when the
				 * last hard link of a file is removed.
				 */
				selfevent = IN_DELETE_SELF;
				if (vp->v_type != VDIR) {
					struct vattr va;
					int error;

					error = VOP_GETATTR(vp, &va,
					    cnp->cn_cred);
					if (error == 0 && va.va_nlink != 0)
						selfevent = 0;
				}
				break;
			case IN_MOVED_FROM:
				cookie = 0;
				selfevent = IN_MOVE_SELF;
				break;
			case _IN_ATTRIB_LINKCOUNT:
				selfevent = IN_ATTRIB;
				break;
			default:
				selfevent = event;
				break;
			}

			if ((selfevent & ~_IN_DIR_EVENTS) != 0) {
				inotify_log(vp, NULL, 0, selfevent | isdir,
				    cookie);
			}
		}

		/*
		 * Something is watching the directory through which this vnode
		 * was referenced, so we may need to log the event.
		 */
		if ((event & IN_ALL_EVENTS) != 0 &&
		    (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
			inotify_log(dvp, cnp->cn_nameptr,
			    cnp->cn_namelen, event | isdir, cookie);
		}
	} else {
		/*
		 * We don't know which watched directory might contain the
		 * vnode, so we have to fall back to searching the name cache.
		 */
		cache_vop_inotify(vp, event, cookie);
	}
}
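
/*
 * Register a watch for a vnode, or update the existing watch if this softc
 * already has one.  Returns EJUSTRETURN in the update case so that the caller
 * can undo the watch accounting it performed in advance.
 */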
int
vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
    uint32_t *wdp, struct thread *td)
{
	struct inotify_watch *watch, *watch1;
	uint32_t wd;

	/*
	 * If this is a directory, make sure all of its entries are present in
	 * the name cache so that we're able to look them up if an event occurs.
	 * The persistent reference on the directory prevents the outgoing name
	 * cache entries from being reclaimed.
	 */
	if (vp->v_type == VDIR) {
		struct dirent *dp;
		char *buf;
		off_t off;
		size_t buflen, len;
		int eof, error;

		buflen = 128 * sizeof(struct dirent);
		buf = malloc(buflen, M_TEMP, M_WAITOK);

		error = 0;
		len = off = eof = 0;
		for (;;) {
			struct nameidata nd;

			error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
			    &len, &off, &eof);
			if (error != 0)
				break;
			if (len == 0)
				/* Finished reading. */
				break;
			if (strcmp(dp->d_name, ".") == 0 ||
			    strcmp(dp->d_name, "..") == 0)
				continue;

			/*
			 * namei() consumes a reference on the starting
			 * directory if it's specified as a vnode.
			 */
			vrefact(vp);
			VOP_UNLOCK(vp);
			NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
			    dp->d_name, vp);
			error = namei(&nd);
			vn_lock(vp, LK_SHARED | LK_RETRY);
			if (error != 0)
				break;
			vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
			vrele(nd.ni_vp);
		}
		free(buf, M_TEMP);
		if (error != 0)
			return (error);
	}

	/*
	 * The vnode referenced in kern_inotify_add_watch() might be different
	 * than this one if nullfs is in the picture.
	 */
	vrefact(vp);
	watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
	watch->sc = sc;
	watch->vp = vp;
	watch->mask = mask;

	/*
	 * Are we updating an existing watch?  Search the vnode's list rather
	 * than that of the softc, as the former is likely to be shorter.
	 */
	v_addpollinfo(vp);
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
		if (watch1->sc == sc)
			break;
	}
	mtx_lock(&sc->lock);
	if (watch1 != NULL) {
		mtx_unlock(&vp->v_pollinfo->vpi_lock);

		/*
		 * We found an existing watch, update it based on our flags.
		 */
		if ((mask & IN_MASK_CREATE) != 0) {
			mtx_unlock(&sc->lock);
			vrele(vp);
			free(watch, M_INOTIFY);
			return (EEXIST);
		}
		if ((mask & IN_MASK_ADD) != 0)
			watch1->mask |= mask;
		else
			watch1->mask = mask;
		*wdp = watch1->wd;
		mtx_unlock(&sc->lock);
		vrele(vp);
		free(watch, M_INOTIFY);
		return (EJUSTRETURN);
	}

	/*
	 * We're creating a new watch.  Add it to the softc and vnode watch
	 * lists.
	 */
	do {
		struct inotify_watch key;

		/*
		 * Search for the next available watch descriptor.  This is
		 * implemented so as to avoid reusing watch descriptors for as
		 * long as possible.
		 */
		key.wd = wd = sc->nextwatch++;
		watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
	} while (watch1 != NULL || wd == 0);
	watch->wd = wd;
	RB_INSERT(inotify_watch_tree, &sc->watches, watch);
	TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
	mtx_unlock(&sc->lock);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	vn_irflag_set_cond(vp, VIRF_INOTIFY);

	*wdp = wd;

	return (0);
}

void
vn_inotify_revoke(struct vnode *vp)
{
	if (vp->v_pollinfo == NULL) {
		/* This is a nullfs vnode which shadows a watched vnode. */
		return;
	}
	inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
}

static int
fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
    struct file **fpp)
{
	struct file *fp;
	int error;

	error = fget(td, fd, needrightsp, &fp);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_INOTIFY) {
		fdrop(fp, td);
		return (EINVAL);
	}
	*fpp = fp;
	return (0);
}
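
/*
 * Common code for the inotify_add_watch_at() system call: validate the event
 * mask, resolve the path relative to dfd, verify read access to the target,
 * charge the global and per-user watch limits, and hand the vnode off to the
 * filesystem's VOP_INOTIFY_ADD_WATCH implementation.
 */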
int
kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
    struct thread *td)
{
	struct nameidata nd;
	struct file *fp;
	struct inotify_softc *sc;
	struct vnode *vp;
	uint32_t wd;
	int count, error;

	fp = NULL;
	vp = NULL;

	if ((mask & IN_ALL_EVENTS) == 0)
		return (EXTERROR(EINVAL, "no events specified"));
	if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
	    (IN_MASK_ADD | IN_MASK_CREATE))
		return (EXTERROR(EINVAL,
		    "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"));
	if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
		return (EXTERROR(EINVAL, "unrecognized flag"));

	error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
	if (error != 0)
		return (error);
	sc = fp->f_data;

	NDINIT_AT(&nd, LOOKUP,
	    ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
	    LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
	error = namei(&nd);
	if (error != 0)
		goto out;
	NDFREE_PNBUF(&nd);
	vp = nd.ni_vp;

	error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
	if (error != 0)
		goto out;

	if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	count = atomic_fetchadd_int(&inotify_watches, 1);
	if (count > inotify_max_watches) {
		atomic_subtract_int(&inotify_watches, 1);
		error = ENOSPC;
		goto out;
	}
	if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
	    inotify_max_user_watches)) {
		atomic_subtract_int(&inotify_watches, 1);
		error = ENOSPC;
		goto out;
	}
	error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
	if (error != 0) {
		atomic_subtract_int(&inotify_watches, 1);
		(void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
		if (error == EJUSTRETURN) {
			/* We updated an existing watch, everything is ok. */
			error = 0;
		} else {
			goto out;
		}
	}
	td->td_retval[0] = wd;

out:
	if (vp != NULL)
		vput(vp);
	fdrop(fp, td);
	return (error);
}

int
sys_inotify_add_watch_at(struct thread *td,
    struct inotify_add_watch_at_args *uap)
{
	return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
	    uap->mask, td));
}
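
/*
 * Common code for the inotify_rm_watch() system call: detach the watch
 * identified by wd and queue an IN_IGNORED event for it.  As on Linux,
 * events already queued for the watch are left in place.
 */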
int
kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
{
	struct file *fp;
	struct inotify_softc *sc;
	struct inotify_record *rec;
	struct inotify_watch key, *watch;
	int error;

	error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
	if (error != 0)
		return (error);
	sc = fp->f_data;

	rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);

	/*
	 * For compatibility with Linux, we do not remove pending events
	 * associated with the watch.  Watch descriptors are implemented so as
	 * to avoid being reused for as long as possible, so one hopes that any
	 * pending events from the removed watch descriptor will be removed
	 * before the watch descriptor is recycled.
	 */
	key.wd = wd;
	mtx_lock(&sc->lock);
	watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
	if (watch == NULL) {
		free(rec, M_INOTIFY);
		error = EINVAL;
	} else {
		RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
		if (!inotify_queue_record(sc, rec)) {
			free(rec, M_INOTIFY);
			error = 0;
		}
	}
	mtx_unlock(&sc->lock);
	if (watch != NULL)
		inotify_remove_watch(watch);
	fdrop(fp, td);
	return (error);
}

int
sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
{
	return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
}