1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 2007-2009 Google Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are 9 * met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above 14 * copyright notice, this list of conditions and the following disclaimer 15 * in the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Google Inc. nor the names of its 18 * contributors may be used to endorse or promote products derived from 19 * this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Copyright (C) 2005 Csaba Henk. 34 * All rights reserved. 35 * 36 * Copyright (c) 2019 The FreeBSD Foundation 37 * 38 * Portions of this software were developed by BFF Storage Systems, LLC under 39 * sponsorship from the FreeBSD Foundation. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 50 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 61 */ 62 63 #include <sys/types.h> 64 #include <sys/param.h> 65 #include <sys/module.h> 66 #include <sys/systm.h> 67 #include <sys/errno.h> 68 #include <sys/kernel.h> 69 #include <sys/conf.h> 70 #include <sys/uio.h> 71 #include <sys/malloc.h> 72 #include <sys/queue.h> 73 #include <sys/lock.h> 74 #include <sys/sx.h> 75 #include <sys/mutex.h> 76 #include <sys/proc.h> 77 #include <sys/mount.h> 78 #include <sys/sdt.h> 79 #include <sys/stat.h> 80 #include <sys/fcntl.h> 81 #include <sys/sysctl.h> 82 #include <sys/poll.h> 83 #include <sys/selinfo.h> 84 #define EXTERR_CATEGORY EXTERR_CAT_FUSE_DEVICE 85 #include <sys/exterrvar.h> 86 87 #include "fuse.h" 88 #include "fuse_internal.h" 89 #include "fuse_ipc.h" 90 91 #include <compat/linux/linux_errno.h> 92 #include <compat/linux/linux_errno.inc> 93 94 SDT_PROVIDER_DECLARE(fusefs); 95 /* 96 * Fuse trace probe: 97 * arg0: verbosity. Higher numbers give more verbose messages 98 * arg1: Textual message 99 */ 100 SDT_PROBE_DEFINE2(fusefs, , device, trace, "int", "char*"); 101 102 static struct cdev *fuse_dev; 103 104 static d_kqfilter_t fuse_device_filter; 105 static d_open_t fuse_device_open; 106 static d_poll_t fuse_device_poll; 107 static d_read_t fuse_device_read; 108 static d_write_t fuse_device_write; 109 110 static struct cdevsw fuse_device_cdevsw = { 111 .d_kqfilter = fuse_device_filter, 112 .d_open = fuse_device_open, 113 .d_name = "fuse", 114 .d_poll = fuse_device_poll, 115 .d_read = fuse_device_read, 116 .d_write = fuse_device_write, 117 .d_version = D_VERSION, 118 }; 119 120 static int fuse_device_filt_read(struct knote *kn, long hint); 121 static int fuse_device_filt_write(struct knote *kn, long hint); 122 static void fuse_device_filt_detach(struct knote *kn); 123 124 static const struct filterops fuse_device_rfiltops = { 125 .f_isfd = 1, 126 .f_detach = fuse_device_filt_detach, 127 .f_event = fuse_device_filt_read, 128 .f_copy = knote_triv_copy, 129 }; 130 131 static const struct filterops fuse_device_wfiltops = { 132 .f_isfd = 1, 133 .f_event = fuse_device_filt_write, 134 .f_copy = knote_triv_copy, 135 }; 136 137 /**************************** 138 * 139 * >>> Fuse device op defs 140 * 141 ****************************/ 142 143 static void 144 fdata_dtor(void *arg) 145 { 146 struct fuse_data *fdata; 147 struct fuse_ticket *tick; 148 149 fdata = arg; 150 if (fdata == NULL) 151 return; 152 153 fdata_set_dead(fdata); 154 155 FUSE_LOCK(); 156 fuse_lck_mtx_lock(fdata->aw_mtx); 157 /* wakup poll()ers */ 158 selwakeuppri(&fdata->ks_rsel, PZERO); 159 /* Don't let syscall handlers wait in vain */ 160 while ((tick = fuse_aw_pop(fdata))) { 161 fuse_lck_mtx_lock(tick->tk_aw_mtx); 162 fticket_set_answered(tick); 163 tick->tk_aw_errno = ENOTCONN; 164 wakeup(tick); 165 fuse_lck_mtx_unlock(tick->tk_aw_mtx); 166 FUSE_ASSERT_AW_DONE(tick); 167 fuse_ticket_drop(tick); 168 } 169 fuse_lck_mtx_unlock(fdata->aw_mtx); 170 171 /* Cleanup unsent operations */ 172 fuse_lck_mtx_lock(fdata->ms_mtx); 173 while ((tick = fuse_ms_pop(fdata))) { 174 fuse_ticket_drop(tick); 175 } 176 fuse_lck_mtx_unlock(fdata->ms_mtx); 177 FUSE_UNLOCK(); 178 179 if (fdata->mp && fdata->dataflags & FSESS_AUTO_UNMOUNT) { 180 vfs_ref(fdata->mp); 181 dounmount(fdata->mp, MNT_FORCE, curthread); 182 } 183 184 fdata_trydestroy(fdata); 185 } 186 187 static int 188 fuse_device_filter(struct cdev *dev, struct knote *kn) 189 { 190 struct fuse_data *data; 191 int error; 192 193 error = devfs_get_cdevpriv((void **)&data); 194 195 if (error == 0 && kn->kn_filter == EVFILT_READ) { 196 kn->kn_fop = &fuse_device_rfiltops; 197 kn->kn_hook = data; 198 knlist_add(&data->ks_rsel.si_note, kn, 0); 199 error = 0; 200 } else if (error == 0 && kn->kn_filter == EVFILT_WRITE) { 201 kn->kn_fop = &fuse_device_wfiltops; 202 error = 0; 203 } else if (error == 0) { 204 error = EXTERROR(EINVAL, "Unsupported kevent filter"); 205 kn->kn_data = error; 206 } 207 208 return (error); 209 } 210 211 static void 212 fuse_device_filt_detach(struct knote *kn) 213 { 214 struct fuse_data *data; 215 216 data = (struct fuse_data*)kn->kn_hook; 217 MPASS(data != NULL); 218 knlist_remove(&data->ks_rsel.si_note, kn, 0); 219 kn->kn_hook = NULL; 220 } 221 222 static int 223 fuse_device_filt_read(struct knote *kn, long hint) 224 { 225 struct fuse_data *data; 226 int ready; 227 228 data = (struct fuse_data*)kn->kn_hook; 229 MPASS(data != NULL); 230 231 mtx_assert(&data->ms_mtx, MA_OWNED); 232 if (fdata_get_dead(data)) { 233 kn->kn_flags |= EV_EOF; 234 kn->kn_fflags = ENODEV; 235 kn->kn_data = 1; 236 ready = 1; 237 } else if (STAILQ_FIRST(&data->ms_head)) { 238 MPASS(data->ms_count >= 1); 239 kn->kn_data = data->ms_count; 240 ready = 1; 241 } else { 242 ready = 0; 243 } 244 245 return (ready); 246 } 247 248 static int 249 fuse_device_filt_write(struct knote *kn, long hint) 250 { 251 252 kn->kn_data = 0; 253 254 /* The device is always ready to write, so we return 1*/ 255 return (1); 256 } 257 258 /* 259 * Resources are set up on a per-open basis 260 */ 261 static int 262 fuse_device_open(struct cdev *dev, int oflags, int devtype, struct thread *td) 263 { 264 struct fuse_data *fdata; 265 int error; 266 267 SDT_PROBE2(fusefs, , device, trace, 1, "device open"); 268 269 fdata = fdata_alloc(dev, td->td_ucred); 270 error = devfs_set_cdevpriv(fdata, fdata_dtor); 271 if (error != 0) 272 fdata_trydestroy(fdata); 273 else 274 SDT_PROBE2(fusefs, , device, trace, 1, "device open success"); 275 return (error); 276 } 277 278 int 279 fuse_device_poll(struct cdev *dev, int events, struct thread *td) 280 { 281 struct fuse_data *data; 282 int error, revents = 0; 283 284 error = devfs_get_cdevpriv((void **)&data); 285 if (error != 0) 286 return (events & 287 (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); 288 289 if (events & (POLLIN | POLLRDNORM)) { 290 fuse_lck_mtx_lock(data->ms_mtx); 291 if (fdata_get_dead(data) || STAILQ_FIRST(&data->ms_head)) 292 revents |= events & (POLLIN | POLLRDNORM); 293 else 294 selrecord(td, &data->ks_rsel); 295 fuse_lck_mtx_unlock(data->ms_mtx); 296 } 297 if (events & (POLLOUT | POLLWRNORM)) { 298 revents |= events & (POLLOUT | POLLWRNORM); 299 } 300 return (revents); 301 } 302 303 /* 304 * fuse_device_read hangs on the queue of VFS messages. 305 * When it's notified that there is a new one, it picks that and 306 * passes up to the daemon 307 */ 308 int 309 fuse_device_read(struct cdev *dev, struct uio *uio, int ioflag) 310 { 311 int err; 312 struct fuse_data *data; 313 struct fuse_ticket *tick; 314 void *buf; 315 int buflen; 316 317 SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read"); 318 319 err = devfs_get_cdevpriv((void **)&data); 320 if (err != 0) 321 return (err); 322 323 fuse_lck_mtx_lock(data->ms_mtx); 324 again: 325 if (fdata_get_dead(data)) { 326 SDT_PROBE2(fusefs, , device, trace, 2, 327 "we know early on that reader should be kicked so we " 328 "don't wait for news"); 329 fuse_lck_mtx_unlock(data->ms_mtx); 330 return (EXTERROR(ENODEV, "This FUSE session is about to be closed")); 331 } 332 if (!(tick = fuse_ms_pop(data))) { 333 /* check if we may block */ 334 if (ioflag & O_NONBLOCK) { 335 /* get outa here soon */ 336 fuse_lck_mtx_unlock(data->ms_mtx); 337 return (EAGAIN); 338 } else { 339 err = msleep(data, &data->ms_mtx, PCATCH, "fu_msg", 0); 340 if (err != 0) { 341 fuse_lck_mtx_unlock(data->ms_mtx); 342 if (fdata_get_dead(data)) 343 err = EXTERROR(ENODEV, 344 "This FUSE session is about to be closed"); 345 return (err); 346 } 347 tick = fuse_ms_pop(data); 348 } 349 } 350 if (!tick) { 351 /* 352 * We can get here if fuse daemon suddenly terminates, 353 * eg, by being hit by a SIGKILL 354 * -- and some other cases, too, tho not totally clear, when 355 * (cv_signal/wakeup_one signals the whole process ?) 356 */ 357 SDT_PROBE2(fusefs, , device, trace, 1, "no message on thread"); 358 goto again; 359 } 360 fuse_lck_mtx_unlock(data->ms_mtx); 361 362 if (fdata_get_dead(data)) { 363 /* 364 * somebody somewhere -- eg., umount routine -- 365 * wants this liaison finished off 366 */ 367 SDT_PROBE2(fusefs, , device, trace, 2, 368 "reader is to be sacked"); 369 if (tick) { 370 SDT_PROBE2(fusefs, , device, trace, 2, "weird -- " 371 "\"kick\" is set tho there is message"); 372 FUSE_ASSERT_MS_DONE(tick); 373 fuse_ticket_drop(tick); 374 } 375 /* This should make the daemon get off of us */ 376 return (EXTERROR(ENODEV, "This FUSE session is about to be closed")); 377 } 378 SDT_PROBE2(fusefs, , device, trace, 1, 379 "fuse device read message successfully"); 380 381 buf = tick->tk_ms_fiov.base; 382 buflen = tick->tk_ms_fiov.len; 383 384 /* 385 * Why not ban mercilessly stupid daemons who can't keep up 386 * with us? (There is no much use of a partial read here...) 387 */ 388 /* 389 * XXX note that in such cases Linux FUSE throws EIO at the 390 * syscall invoker and stands back to the message queue. The 391 * rationale should be made clear (and possibly adopt that 392 * behaviour). Keeping the current scheme at least makes 393 * fallacy as loud as possible... 394 */ 395 if (uio->uio_resid < buflen) { 396 fdata_set_dead(data); 397 SDT_PROBE2(fusefs, , device, trace, 2, 398 "daemon is stupid, kick it off..."); 399 err = EXTERROR(ENODEV, "Partial read attempted"); 400 } else { 401 err = uiomove(buf, buflen, uio); 402 } 403 404 FUSE_ASSERT_MS_DONE(tick); 405 fuse_ticket_drop(tick); 406 407 return (err); 408 } 409 410 static inline int 411 fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio) 412 { 413 if (uio->uio_resid + sizeof(struct fuse_out_header) != ohead->len) { 414 SDT_PROBE2(fusefs, , device, trace, 1, 415 "Format error: body size " 416 "differs from size claimed by header"); 417 return (EXTERROR(EINVAL, "Format error: body size " 418 "differs from size claimed by header")); 419 } 420 if (uio->uio_resid && ohead->unique != 0 && ohead->error) { 421 SDT_PROBE2(fusefs, , device, trace, 1, 422 "Format error: non zero error but message had a body"); 423 return (EXTERROR(EINVAL, "Format error: non zero error, " 424 "but message had a body")); 425 } 426 427 return (0); 428 } 429 430 SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_notify, 431 "struct fuse_out_header*"); 432 SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_missing_ticket, 433 "uint64_t"); 434 SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_found, 435 "struct fuse_ticket*"); 436 /* 437 * fuse_device_write first reads the header sent by the daemon. 438 * If that's OK, looks up ticket/callback node by the unique id seen in header. 439 * If the callback node contains a handler function, the uio is passed over 440 * that. 441 */ 442 static int 443 fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) 444 { 445 struct fuse_out_header ohead; 446 int err = 0; 447 struct fuse_data *data; 448 struct mount *mp; 449 struct fuse_ticket *tick, *itick, *x_tick; 450 int found = 0; 451 452 err = devfs_get_cdevpriv((void **)&data); 453 if (err != 0) 454 return (err); 455 456 if (uio->uio_resid < sizeof(struct fuse_out_header)) { 457 SDT_PROBE2(fusefs, , device, trace, 1, 458 "fuse_device_write got less than a header!"); 459 fdata_set_dead(data); 460 return (EXTERROR(EINVAL, "fuse_device_write got less than a header!")); 461 } 462 if ((err = uiomove(&ohead, sizeof(struct fuse_out_header), uio)) != 0) 463 return (err); 464 465 if (data->linux_errnos != 0 && ohead.error != 0) { 466 err = -ohead.error; 467 if (err < 0 || err >= nitems(linux_to_bsd_errtbl)) 468 return (EXTERROR(EINVAL, "Unknown Linux errno", err)); 469 470 /* '-', because it will get flipped again below */ 471 ohead.error = -linux_to_bsd_errtbl[err]; 472 } 473 474 /* 475 * We check header information (which is redundant) and compare it 476 * with what we see. If we see some inconsistency we discard the 477 * whole answer and proceed on as if it had never existed. In 478 * particular, no pretender will be woken up, regardless the 479 * "unique" value in the header. 480 */ 481 if ((err = fuse_ohead_audit(&ohead, uio))) { 482 fdata_set_dead(data); 483 return (err); 484 } 485 /* Pass stuff over to callback if there is one installed */ 486 487 /* Looking for ticket with the unique id of header */ 488 fuse_lck_mtx_lock(data->aw_mtx); 489 TAILQ_FOREACH_SAFE(tick, &data->aw_head, tk_aw_link, 490 x_tick) { 491 if (tick->tk_unique == ohead.unique) { 492 SDT_PROBE1(fusefs, , device, fuse_device_write_found, 493 tick); 494 found = 1; 495 fuse_aw_remove(tick); 496 break; 497 } 498 } 499 if (found && tick->irq_unique > 0) { 500 /* 501 * Discard the FUSE_INTERRUPT ticket that tried to interrupt 502 * this operation 503 */ 504 TAILQ_FOREACH_SAFE(itick, &data->aw_head, tk_aw_link, 505 x_tick) { 506 if (itick->tk_unique == tick->irq_unique) { 507 fuse_aw_remove(itick); 508 fuse_ticket_drop(itick); 509 break; 510 } 511 } 512 tick->irq_unique = 0; 513 } 514 fuse_lck_mtx_unlock(data->aw_mtx); 515 516 if (found) { 517 if (tick->tk_aw_handler) { 518 /* 519 * We found a callback with proper handler. In this 520 * case the out header will be 0wnd by the callback, 521 * so the fun of freeing that is left for her. 522 * (Then, by all chance, she'll just get that's done 523 * via ticket_drop(), so no manual mucking 524 * around...) 525 */ 526 SDT_PROBE2(fusefs, , device, trace, 1, 527 "pass ticket to a callback"); 528 /* Sanitize the linuxism of negative errnos */ 529 ohead.error *= -1; 530 if (ohead.error < 0 || ohead.error > ELAST) { 531 /* Illegal error code */ 532 ohead.error = EIO; 533 memcpy(&tick->tk_aw_ohead, &ohead, 534 sizeof(ohead)); 535 tick->tk_aw_handler(tick, uio); 536 err = EXTERROR(EINVAL, "Unknown errno", ohead.error); 537 } else { 538 memcpy(&tick->tk_aw_ohead, &ohead, 539 sizeof(ohead)); 540 err = tick->tk_aw_handler(tick, uio); 541 } 542 } else { 543 /* pretender doesn't wanna do anything with answer */ 544 SDT_PROBE2(fusefs, , device, trace, 1, 545 "stuff devalidated, so we drop it"); 546 } 547 548 /* 549 * As aw_mtx was not held during the callback execution the 550 * ticket may have been inserted again. However, this is safe 551 * because fuse_ticket_drop() will deal with refcount anyway. 552 */ 553 fuse_ticket_drop(tick); 554 } else if (ohead.unique == 0){ 555 /* unique == 0 means asynchronous notification */ 556 SDT_PROBE1(fusefs, , device, fuse_device_write_notify, &ohead); 557 if (data->mp == NULL) { 558 SDT_PROBE2(fusefs, , device, trace, 1, 559 "asynchronous notification before mount" 560 " or after unmount"); 561 return (EXTERROR(ENODEV, 562 "This FUSE session is not mounted")); 563 } 564 mp = data->mp; 565 vfs_ref(mp); 566 err = vfs_busy(mp, 0); 567 vfs_rel(mp); 568 if (err) 569 return (err); 570 571 switch (ohead.error) { 572 case FUSE_NOTIFY_INVAL_ENTRY: 573 err = fuse_internal_invalidate_entry(mp, uio); 574 break; 575 case FUSE_NOTIFY_INVAL_INODE: 576 err = fuse_internal_invalidate_inode(mp, uio); 577 break; 578 case FUSE_NOTIFY_RETRIEVE: 579 case FUSE_NOTIFY_STORE: 580 /* 581 * Unimplemented. I don't know of any file systems 582 * that use them, and the protocol isn't sound anyway, 583 * since the notification messages don't include the 584 * inode's generation number. Without that, it's 585 * possible to manipulate the cache of the wrong vnode. 586 * Finally, it's not defined what this message should 587 * do for a file with dirty cache. 588 */ 589 case FUSE_NOTIFY_POLL: 590 /* Unimplemented. See comments in fuse_vnops */ 591 default: 592 /* Not implemented */ 593 err = EXTERROR(ENOSYS, "Unimplemented FUSE notification code", 594 ohead.error); 595 } 596 vfs_unbusy(mp); 597 } else { 598 /* no callback at all! */ 599 SDT_PROBE1(fusefs, , device, fuse_device_write_missing_ticket, 600 ohead.unique); 601 if (ohead.error == -EAGAIN) { 602 /* 603 * This was probably a response to a FUSE_INTERRUPT 604 * operation whose original operation is already 605 * complete. We can't store FUSE_INTERRUPT tickets 606 * indefinitely because their responses are optional. 607 * So we delete them when the original operation 608 * completes. And sadly the fuse_header_out doesn't 609 * identify the opcode, so we have to guess. 610 */ 611 err = 0; 612 } else { 613 err = EXTERROR(EINVAL, "FUSE ticket is missing"); 614 } 615 } 616 617 return (err); 618 } 619 620 int 621 fuse_device_init(void) 622 { 623 624 fuse_dev = make_dev(&fuse_device_cdevsw, 0, UID_ROOT, GID_OPERATOR, 625 S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, "fuse"); 626 if (fuse_dev == NULL) 627 return (ENOMEM); 628 return (0); 629 } 630 631 void 632 fuse_device_destroy(void) 633 { 634 635 MPASS(fuse_dev != NULL); 636 destroy_dev(fuse_dev); 637 } 638