1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 2007-2009 Google Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are 9 * met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above 14 * copyright notice, this list of conditions and the following disclaimer 15 * in the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Google Inc. nor the names of its 18 * contributors may be used to endorse or promote products derived from 19 * this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Copyright (C) 2005 Csaba Henk. 34 * All rights reserved. 35 * 36 * Copyright (c) 2019 The FreeBSD Foundation 37 * 38 * Portions of this software were developed by BFF Storage Systems, LLC under 39 * sponsorship from the FreeBSD Foundation. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 50 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 61 */ 62 63 #include <sys/types.h> 64 #include <sys/param.h> 65 #include <sys/module.h> 66 #include <sys/systm.h> 67 #include <sys/errno.h> 68 #include <sys/param.h> 69 #include <sys/kernel.h> 70 #include <sys/conf.h> 71 #include <sys/uio.h> 72 #include <sys/malloc.h> 73 #include <sys/queue.h> 74 #include <sys/lock.h> 75 #include <sys/sx.h> 76 #include <sys/mutex.h> 77 #include <sys/proc.h> 78 #include <sys/mount.h> 79 #include <sys/sdt.h> 80 #include <sys/stat.h> 81 #include <sys/fcntl.h> 82 #include <sys/sysctl.h> 83 #include <sys/poll.h> 84 #include <sys/selinfo.h> 85 #define EXTERR_CATEGORY EXTERR_CAT_FUSE 86 #include <sys/exterrvar.h> 87 88 #include "fuse.h" 89 #include "fuse_internal.h" 90 #include "fuse_ipc.h" 91 92 #include <compat/linux/linux_errno.h> 93 #include <compat/linux/linux_errno.inc> 94 95 SDT_PROVIDER_DECLARE(fusefs); 96 /* 97 * Fuse trace probe: 98 * arg0: verbosity. Higher numbers give more verbose messages 99 * arg1: Textual message 100 */ 101 SDT_PROBE_DEFINE2(fusefs, , device, trace, "int", "char*"); 102 103 static struct cdev *fuse_dev; 104 105 static d_kqfilter_t fuse_device_filter; 106 static d_open_t fuse_device_open; 107 static d_poll_t fuse_device_poll; 108 static d_read_t fuse_device_read; 109 static d_write_t fuse_device_write; 110 111 static struct cdevsw fuse_device_cdevsw = { 112 .d_kqfilter = fuse_device_filter, 113 .d_open = fuse_device_open, 114 .d_name = "fuse", 115 .d_poll = fuse_device_poll, 116 .d_read = fuse_device_read, 117 .d_write = fuse_device_write, 118 .d_version = D_VERSION, 119 }; 120 121 static int fuse_device_filt_read(struct knote *kn, long hint); 122 static int fuse_device_filt_write(struct knote *kn, long hint); 123 static void fuse_device_filt_detach(struct knote *kn); 124 125 static const struct filterops fuse_device_rfiltops = { 126 .f_isfd = 1, 127 .f_detach = fuse_device_filt_detach, 128 .f_event = fuse_device_filt_read, 129 .f_copy = knote_triv_copy, 130 }; 131 132 static const struct filterops fuse_device_wfiltops = { 133 .f_isfd = 1, 134 .f_event = fuse_device_filt_write, 135 .f_copy = knote_triv_copy, 136 }; 137 138 /**************************** 139 * 140 * >>> Fuse device op defs 141 * 142 ****************************/ 143 144 static void 145 fdata_dtor(void *arg) 146 { 147 struct fuse_data *fdata; 148 struct fuse_ticket *tick; 149 150 fdata = arg; 151 if (fdata == NULL) 152 return; 153 154 fdata_set_dead(fdata); 155 156 FUSE_LOCK(); 157 fuse_lck_mtx_lock(fdata->aw_mtx); 158 /* wakup poll()ers */ 159 selwakeuppri(&fdata->ks_rsel, PZERO); 160 /* Don't let syscall handlers wait in vain */ 161 while ((tick = fuse_aw_pop(fdata))) { 162 fuse_lck_mtx_lock(tick->tk_aw_mtx); 163 fticket_set_answered(tick); 164 tick->tk_aw_errno = ENOTCONN; 165 wakeup(tick); 166 fuse_lck_mtx_unlock(tick->tk_aw_mtx); 167 FUSE_ASSERT_AW_DONE(tick); 168 fuse_ticket_drop(tick); 169 } 170 fuse_lck_mtx_unlock(fdata->aw_mtx); 171 172 /* Cleanup unsent operations */ 173 fuse_lck_mtx_lock(fdata->ms_mtx); 174 while ((tick = fuse_ms_pop(fdata))) { 175 fuse_ticket_drop(tick); 176 } 177 fuse_lck_mtx_unlock(fdata->ms_mtx); 178 FUSE_UNLOCK(); 179 180 fdata_trydestroy(fdata); 181 } 182 183 static int 184 fuse_device_filter(struct cdev *dev, struct knote *kn) 185 { 186 struct fuse_data *data; 187 int error; 188 189 error = devfs_get_cdevpriv((void **)&data); 190 191 if (error == 0 && kn->kn_filter == EVFILT_READ) { 192 kn->kn_fop = &fuse_device_rfiltops; 193 kn->kn_hook = data; 194 knlist_add(&data->ks_rsel.si_note, kn, 0); 195 error = 0; 196 } else if (error == 0 && kn->kn_filter == EVFILT_WRITE) { 197 kn->kn_fop = &fuse_device_wfiltops; 198 error = 0; 199 } else if (error == 0) { 200 error = EXTERROR(EINVAL, "Unsupported kevent filter"); 201 kn->kn_data = error; 202 } 203 204 return (error); 205 } 206 207 static void 208 fuse_device_filt_detach(struct knote *kn) 209 { 210 struct fuse_data *data; 211 212 data = (struct fuse_data*)kn->kn_hook; 213 MPASS(data != NULL); 214 knlist_remove(&data->ks_rsel.si_note, kn, 0); 215 kn->kn_hook = NULL; 216 } 217 218 static int 219 fuse_device_filt_read(struct knote *kn, long hint) 220 { 221 struct fuse_data *data; 222 int ready; 223 224 data = (struct fuse_data*)kn->kn_hook; 225 MPASS(data != NULL); 226 227 mtx_assert(&data->ms_mtx, MA_OWNED); 228 if (fdata_get_dead(data)) { 229 kn->kn_flags |= EV_EOF; 230 kn->kn_fflags = ENODEV; 231 kn->kn_data = 1; 232 ready = 1; 233 } else if (STAILQ_FIRST(&data->ms_head)) { 234 MPASS(data->ms_count >= 1); 235 kn->kn_data = data->ms_count; 236 ready = 1; 237 } else { 238 ready = 0; 239 } 240 241 return (ready); 242 } 243 244 static int 245 fuse_device_filt_write(struct knote *kn, long hint) 246 { 247 248 kn->kn_data = 0; 249 250 /* The device is always ready to write, so we return 1*/ 251 return (1); 252 } 253 254 /* 255 * Resources are set up on a per-open basis 256 */ 257 static int 258 fuse_device_open(struct cdev *dev, int oflags, int devtype, struct thread *td) 259 { 260 struct fuse_data *fdata; 261 int error; 262 263 SDT_PROBE2(fusefs, , device, trace, 1, "device open"); 264 265 fdata = fdata_alloc(dev, td->td_ucred); 266 error = devfs_set_cdevpriv(fdata, fdata_dtor); 267 if (error != 0) 268 fdata_trydestroy(fdata); 269 else 270 SDT_PROBE2(fusefs, , device, trace, 1, "device open success"); 271 return (error); 272 } 273 274 int 275 fuse_device_poll(struct cdev *dev, int events, struct thread *td) 276 { 277 struct fuse_data *data; 278 int error, revents = 0; 279 280 error = devfs_get_cdevpriv((void **)&data); 281 if (error != 0) 282 return (events & 283 (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); 284 285 if (events & (POLLIN | POLLRDNORM)) { 286 fuse_lck_mtx_lock(data->ms_mtx); 287 if (fdata_get_dead(data) || STAILQ_FIRST(&data->ms_head)) 288 revents |= events & (POLLIN | POLLRDNORM); 289 else 290 selrecord(td, &data->ks_rsel); 291 fuse_lck_mtx_unlock(data->ms_mtx); 292 } 293 if (events & (POLLOUT | POLLWRNORM)) { 294 revents |= events & (POLLOUT | POLLWRNORM); 295 } 296 return (revents); 297 } 298 299 /* 300 * fuse_device_read hangs on the queue of VFS messages. 301 * When it's notified that there is a new one, it picks that and 302 * passes up to the daemon 303 */ 304 int 305 fuse_device_read(struct cdev *dev, struct uio *uio, int ioflag) 306 { 307 int err; 308 struct fuse_data *data; 309 struct fuse_ticket *tick; 310 void *buf; 311 int buflen; 312 313 SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read"); 314 315 err = devfs_get_cdevpriv((void **)&data); 316 if (err != 0) 317 return (err); 318 319 fuse_lck_mtx_lock(data->ms_mtx); 320 again: 321 if (fdata_get_dead(data)) { 322 SDT_PROBE2(fusefs, , device, trace, 2, 323 "we know early on that reader should be kicked so we " 324 "don't wait for news"); 325 fuse_lck_mtx_unlock(data->ms_mtx); 326 return (EXTERROR(ENODEV, "This FUSE session is about to be closed")); 327 } 328 if (!(tick = fuse_ms_pop(data))) { 329 /* check if we may block */ 330 if (ioflag & O_NONBLOCK) { 331 /* get outa here soon */ 332 fuse_lck_mtx_unlock(data->ms_mtx); 333 return (EAGAIN); 334 } else { 335 err = msleep(data, &data->ms_mtx, PCATCH, "fu_msg", 0); 336 if (err != 0) { 337 fuse_lck_mtx_unlock(data->ms_mtx); 338 if (fdata_get_dead(data)) 339 err = EXTERROR(ENODEV, 340 "This FUSE session is about to be closed"); 341 return (err); 342 } 343 tick = fuse_ms_pop(data); 344 } 345 } 346 if (!tick) { 347 /* 348 * We can get here if fuse daemon suddenly terminates, 349 * eg, by being hit by a SIGKILL 350 * -- and some other cases, too, tho not totally clear, when 351 * (cv_signal/wakeup_one signals the whole process ?) 352 */ 353 SDT_PROBE2(fusefs, , device, trace, 1, "no message on thread"); 354 goto again; 355 } 356 fuse_lck_mtx_unlock(data->ms_mtx); 357 358 if (fdata_get_dead(data)) { 359 /* 360 * somebody somewhere -- eg., umount routine -- 361 * wants this liaison finished off 362 */ 363 SDT_PROBE2(fusefs, , device, trace, 2, 364 "reader is to be sacked"); 365 if (tick) { 366 SDT_PROBE2(fusefs, , device, trace, 2, "weird -- " 367 "\"kick\" is set tho there is message"); 368 FUSE_ASSERT_MS_DONE(tick); 369 fuse_ticket_drop(tick); 370 } 371 /* This should make the daemon get off of us */ 372 return (EXTERROR(ENODEV, "This FUSE session is about to be closed")); 373 } 374 SDT_PROBE2(fusefs, , device, trace, 1, 375 "fuse device read message successfully"); 376 377 buf = tick->tk_ms_fiov.base; 378 buflen = tick->tk_ms_fiov.len; 379 380 /* 381 * Why not ban mercilessly stupid daemons who can't keep up 382 * with us? (There is no much use of a partial read here...) 383 */ 384 /* 385 * XXX note that in such cases Linux FUSE throws EIO at the 386 * syscall invoker and stands back to the message queue. The 387 * rationale should be made clear (and possibly adopt that 388 * behaviour). Keeping the current scheme at least makes 389 * fallacy as loud as possible... 390 */ 391 if (uio->uio_resid < buflen) { 392 fdata_set_dead(data); 393 SDT_PROBE2(fusefs, , device, trace, 2, 394 "daemon is stupid, kick it off..."); 395 err = EXTERROR(ENODEV, "Partial read attempted"); 396 } else { 397 err = uiomove(buf, buflen, uio); 398 } 399 400 FUSE_ASSERT_MS_DONE(tick); 401 fuse_ticket_drop(tick); 402 403 return (err); 404 } 405 406 static inline int 407 fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio) 408 { 409 if (uio->uio_resid + sizeof(struct fuse_out_header) != ohead->len) { 410 SDT_PROBE2(fusefs, , device, trace, 1, 411 "Format error: body size " 412 "differs from size claimed by header"); 413 return (EXTERROR(EINVAL, "Format error: body size " 414 "differs from size claimed by header")); 415 } 416 if (uio->uio_resid && ohead->unique != 0 && ohead->error) { 417 SDT_PROBE2(fusefs, , device, trace, 1, 418 "Format error: non zero error but message had a body"); 419 return (EXTERROR(EINVAL, "Format error: non zero error, " 420 "but message had a body")); 421 } 422 423 return (0); 424 } 425 426 SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_notify, 427 "struct fuse_out_header*"); 428 SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_missing_ticket, 429 "uint64_t"); 430 SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_found, 431 "struct fuse_ticket*"); 432 /* 433 * fuse_device_write first reads the header sent by the daemon. 434 * If that's OK, looks up ticket/callback node by the unique id seen in header. 435 * If the callback node contains a handler function, the uio is passed over 436 * that. 437 */ 438 static int 439 fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) 440 { 441 struct fuse_out_header ohead; 442 int err = 0; 443 struct fuse_data *data; 444 struct mount *mp; 445 struct fuse_ticket *tick, *itick, *x_tick; 446 int found = 0; 447 448 err = devfs_get_cdevpriv((void **)&data); 449 if (err != 0) 450 return (err); 451 452 if (uio->uio_resid < sizeof(struct fuse_out_header)) { 453 SDT_PROBE2(fusefs, , device, trace, 1, 454 "fuse_device_write got less than a header!"); 455 fdata_set_dead(data); 456 return (EXTERROR(EINVAL, "fuse_device_write got less than a header!")); 457 } 458 if ((err = uiomove(&ohead, sizeof(struct fuse_out_header), uio)) != 0) 459 return (err); 460 461 if (data->linux_errnos != 0 && ohead.error != 0) { 462 err = -ohead.error; 463 if (err < 0 || err >= nitems(linux_to_bsd_errtbl)) 464 return (EXTERROR(EINVAL, "Unknown Linux errno", err)); 465 466 /* '-', because it will get flipped again below */ 467 ohead.error = -linux_to_bsd_errtbl[err]; 468 } 469 470 /* 471 * We check header information (which is redundant) and compare it 472 * with what we see. If we see some inconsistency we discard the 473 * whole answer and proceed on as if it had never existed. In 474 * particular, no pretender will be woken up, regardless the 475 * "unique" value in the header. 476 */ 477 if ((err = fuse_ohead_audit(&ohead, uio))) { 478 fdata_set_dead(data); 479 return (err); 480 } 481 /* Pass stuff over to callback if there is one installed */ 482 483 /* Looking for ticket with the unique id of header */ 484 fuse_lck_mtx_lock(data->aw_mtx); 485 TAILQ_FOREACH_SAFE(tick, &data->aw_head, tk_aw_link, 486 x_tick) { 487 if (tick->tk_unique == ohead.unique) { 488 SDT_PROBE1(fusefs, , device, fuse_device_write_found, 489 tick); 490 found = 1; 491 fuse_aw_remove(tick); 492 break; 493 } 494 } 495 if (found && tick->irq_unique > 0) { 496 /* 497 * Discard the FUSE_INTERRUPT ticket that tried to interrupt 498 * this operation 499 */ 500 TAILQ_FOREACH_SAFE(itick, &data->aw_head, tk_aw_link, 501 x_tick) { 502 if (itick->tk_unique == tick->irq_unique) { 503 fuse_aw_remove(itick); 504 fuse_ticket_drop(itick); 505 break; 506 } 507 } 508 tick->irq_unique = 0; 509 } 510 fuse_lck_mtx_unlock(data->aw_mtx); 511 512 if (found) { 513 if (tick->tk_aw_handler) { 514 /* 515 * We found a callback with proper handler. In this 516 * case the out header will be 0wnd by the callback, 517 * so the fun of freeing that is left for her. 518 * (Then, by all chance, she'll just get that's done 519 * via ticket_drop(), so no manual mucking 520 * around...) 521 */ 522 SDT_PROBE2(fusefs, , device, trace, 1, 523 "pass ticket to a callback"); 524 /* Sanitize the linuxism of negative errnos */ 525 ohead.error *= -1; 526 if (ohead.error < 0 || ohead.error > ELAST) { 527 /* Illegal error code */ 528 ohead.error = EIO; 529 memcpy(&tick->tk_aw_ohead, &ohead, 530 sizeof(ohead)); 531 tick->tk_aw_handler(tick, uio); 532 err = EXTERROR(EINVAL, "Unknown errno", ohead.error); 533 } else { 534 memcpy(&tick->tk_aw_ohead, &ohead, 535 sizeof(ohead)); 536 err = tick->tk_aw_handler(tick, uio); 537 } 538 } else { 539 /* pretender doesn't wanna do anything with answer */ 540 SDT_PROBE2(fusefs, , device, trace, 1, 541 "stuff devalidated, so we drop it"); 542 } 543 544 /* 545 * As aw_mtx was not held during the callback execution the 546 * ticket may have been inserted again. However, this is safe 547 * because fuse_ticket_drop() will deal with refcount anyway. 548 */ 549 fuse_ticket_drop(tick); 550 } else if (ohead.unique == 0){ 551 /* unique == 0 means asynchronous notification */ 552 SDT_PROBE1(fusefs, , device, fuse_device_write_notify, &ohead); 553 if (data->mp == NULL) { 554 SDT_PROBE2(fusefs, , device, trace, 1, 555 "asynchronous notification before mount" 556 " or after unmount"); 557 return (EXTERROR(ENODEV, 558 "This FUSE session is not mounted")); 559 } 560 mp = data->mp; 561 vfs_ref(mp); 562 err = vfs_busy(mp, 0); 563 vfs_rel(mp); 564 if (err) 565 return (err); 566 567 switch (ohead.error) { 568 case FUSE_NOTIFY_INVAL_ENTRY: 569 err = fuse_internal_invalidate_entry(mp, uio); 570 break; 571 case FUSE_NOTIFY_INVAL_INODE: 572 err = fuse_internal_invalidate_inode(mp, uio); 573 break; 574 case FUSE_NOTIFY_RETRIEVE: 575 case FUSE_NOTIFY_STORE: 576 /* 577 * Unimplemented. I don't know of any file systems 578 * that use them, and the protocol isn't sound anyway, 579 * since the notification messages don't include the 580 * inode's generation number. Without that, it's 581 * possible to manipulate the cache of the wrong vnode. 582 * Finally, it's not defined what this message should 583 * do for a file with dirty cache. 584 */ 585 case FUSE_NOTIFY_POLL: 586 /* Unimplemented. See comments in fuse_vnops */ 587 default: 588 /* Not implemented */ 589 err = EXTERROR(ENOSYS, "Unimplemented FUSE notification code", 590 ohead.error); 591 } 592 vfs_unbusy(mp); 593 } else { 594 /* no callback at all! */ 595 SDT_PROBE1(fusefs, , device, fuse_device_write_missing_ticket, 596 ohead.unique); 597 if (ohead.error == -EAGAIN) { 598 /* 599 * This was probably a response to a FUSE_INTERRUPT 600 * operation whose original operation is already 601 * complete. We can't store FUSE_INTERRUPT tickets 602 * indefinitely because their responses are optional. 603 * So we delete them when the original operation 604 * completes. And sadly the fuse_header_out doesn't 605 * identify the opcode, so we have to guess. 606 */ 607 err = 0; 608 } else { 609 err = EXTERROR(EINVAL, "FUSE ticket is missing"); 610 } 611 } 612 613 return (err); 614 } 615 616 int 617 fuse_device_init(void) 618 { 619 620 fuse_dev = make_dev(&fuse_device_cdevsw, 0, UID_ROOT, GID_OPERATOR, 621 S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, "fuse"); 622 if (fuse_dev == NULL) 623 return (ENOMEM); 624 return (0); 625 } 626 627 void 628 fuse_device_destroy(void) 629 { 630 631 MPASS(fuse_dev != NULL); 632 destroy_dev(fuse_dev); 633 } 634