1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 2007-2009 Google Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are 9 * met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above 14 * copyright notice, this list of conditions and the following disclaimer 15 * in the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Google Inc. nor the names of its 18 * contributors may be used to endorse or promote products derived from 19 * this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Copyright (C) 2005 Csaba Henk. 34 * All rights reserved. 35 * 36 * Copyright (c) 2019 The FreeBSD Foundation 37 * 38 * Portions of this software were developed by BFF Storage Systems, LLC under 39 * sponsorship from the FreeBSD Foundation. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 50 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 61 */ 62 63 #include <sys/cdefs.h> 64 #include <sys/types.h> 65 #include <sys/param.h> 66 #include <sys/module.h> 67 #include <sys/systm.h> 68 #include <sys/errno.h> 69 #include <sys/param.h> 70 #include <sys/kernel.h> 71 #include <sys/conf.h> 72 #include <sys/uio.h> 73 #include <sys/malloc.h> 74 #include <sys/queue.h> 75 #include <sys/lock.h> 76 #include <sys/sx.h> 77 #include <sys/mutex.h> 78 #include <sys/proc.h> 79 #include <sys/mount.h> 80 #include <sys/sdt.h> 81 #include <sys/stat.h> 82 #include <sys/fcntl.h> 83 #include <sys/sysctl.h> 84 #include <sys/poll.h> 85 #include <sys/selinfo.h> 86 87 #include "fuse.h" 88 #include "fuse_internal.h" 89 #include "fuse_ipc.h" 90 91 #include <compat/linux/linux_errno.h> 92 #include <compat/linux/linux_errno.inc> 93 94 SDT_PROVIDER_DECLARE(fusefs); 95 /* 96 * Fuse trace probe: 97 * arg0: verbosity. Higher numbers give more verbose messages 98 * arg1: Textual message 99 */ 100 SDT_PROBE_DEFINE2(fusefs, , device, trace, "int", "char*"); 101 102 static struct cdev *fuse_dev; 103 104 static d_kqfilter_t fuse_device_filter; 105 static d_open_t fuse_device_open; 106 static d_poll_t fuse_device_poll; 107 static d_read_t fuse_device_read; 108 static d_write_t fuse_device_write; 109 110 static struct cdevsw fuse_device_cdevsw = { 111 .d_kqfilter = fuse_device_filter, 112 .d_open = fuse_device_open, 113 .d_name = "fuse", 114 .d_poll = fuse_device_poll, 115 .d_read = fuse_device_read, 116 .d_write = fuse_device_write, 117 .d_version = D_VERSION, 118 }; 119 120 static int fuse_device_filt_read(struct knote *kn, long hint); 121 static int fuse_device_filt_write(struct knote *kn, long hint); 122 static void fuse_device_filt_detach(struct knote *kn); 123 124 struct filterops fuse_device_rfiltops = { 125 .f_isfd = 1, 126 .f_detach = fuse_device_filt_detach, 127 .f_event = fuse_device_filt_read, 128 }; 129 130 struct filterops fuse_device_wfiltops = { 131 .f_isfd = 1, 132 .f_event = fuse_device_filt_write, 133 }; 134 135 /**************************** 136 * 137 * >>> Fuse device op defs 138 * 139 ****************************/ 140 141 static void 142 fdata_dtor(void *arg) 143 { 144 struct fuse_data *fdata; 145 struct fuse_ticket *tick; 146 147 fdata = arg; 148 if (fdata == NULL) 149 return; 150 151 fdata_set_dead(fdata); 152 153 FUSE_LOCK(); 154 fuse_lck_mtx_lock(fdata->aw_mtx); 155 /* wakup poll()ers */ 156 selwakeuppri(&fdata->ks_rsel, PZERO + 1); 157 /* Don't let syscall handlers wait in vain */ 158 while ((tick = fuse_aw_pop(fdata))) { 159 fuse_lck_mtx_lock(tick->tk_aw_mtx); 160 fticket_set_answered(tick); 161 tick->tk_aw_errno = ENOTCONN; 162 wakeup(tick); 163 fuse_lck_mtx_unlock(tick->tk_aw_mtx); 164 FUSE_ASSERT_AW_DONE(tick); 165 fuse_ticket_drop(tick); 166 } 167 fuse_lck_mtx_unlock(fdata->aw_mtx); 168 169 /* Cleanup unsent operations */ 170 fuse_lck_mtx_lock(fdata->ms_mtx); 171 while ((tick = fuse_ms_pop(fdata))) { 172 fuse_ticket_drop(tick); 173 } 174 fuse_lck_mtx_unlock(fdata->ms_mtx); 175 FUSE_UNLOCK(); 176 177 fdata_trydestroy(fdata); 178 } 179 180 static int 181 fuse_device_filter(struct cdev *dev, struct knote *kn) 182 { 183 struct fuse_data *data; 184 int error; 185 186 error = devfs_get_cdevpriv((void **)&data); 187 188 if (error == 0 && kn->kn_filter == EVFILT_READ) { 189 kn->kn_fop = &fuse_device_rfiltops; 190 kn->kn_hook = data; 191 knlist_add(&data->ks_rsel.si_note, kn, 0); 192 error = 0; 193 } else if (error == 0 && kn->kn_filter == EVFILT_WRITE) { 194 kn->kn_fop = &fuse_device_wfiltops; 195 error = 0; 196 } else if (error == 0) { 197 error = EINVAL; 198 kn->kn_data = error; 199 } 200 201 return (error); 202 } 203 204 static void 205 fuse_device_filt_detach(struct knote *kn) 206 { 207 struct fuse_data *data; 208 209 data = (struct fuse_data*)kn->kn_hook; 210 MPASS(data != NULL); 211 knlist_remove(&data->ks_rsel.si_note, kn, 0); 212 kn->kn_hook = NULL; 213 } 214 215 static int 216 fuse_device_filt_read(struct knote *kn, long hint) 217 { 218 struct fuse_data *data; 219 int ready; 220 221 data = (struct fuse_data*)kn->kn_hook; 222 MPASS(data != NULL); 223 224 mtx_assert(&data->ms_mtx, MA_OWNED); 225 if (fdata_get_dead(data)) { 226 kn->kn_flags |= EV_EOF; 227 kn->kn_fflags = ENODEV; 228 kn->kn_data = 1; 229 ready = 1; 230 } else if (STAILQ_FIRST(&data->ms_head)) { 231 MPASS(data->ms_count >= 1); 232 kn->kn_data = data->ms_count; 233 ready = 1; 234 } else { 235 ready = 0; 236 } 237 238 return (ready); 239 } 240 241 static int 242 fuse_device_filt_write(struct knote *kn, long hint) 243 { 244 245 kn->kn_data = 0; 246 247 /* The device is always ready to write, so we return 1*/ 248 return (1); 249 } 250 251 /* 252 * Resources are set up on a per-open basis 253 */ 254 static int 255 fuse_device_open(struct cdev *dev, int oflags, int devtype, struct thread *td) 256 { 257 struct fuse_data *fdata; 258 int error; 259 260 SDT_PROBE2(fusefs, , device, trace, 1, "device open"); 261 262 fdata = fdata_alloc(dev, td->td_ucred); 263 error = devfs_set_cdevpriv(fdata, fdata_dtor); 264 if (error != 0) 265 fdata_trydestroy(fdata); 266 else 267 SDT_PROBE2(fusefs, , device, trace, 1, "device open success"); 268 return (error); 269 } 270 271 int 272 fuse_device_poll(struct cdev *dev, int events, struct thread *td) 273 { 274 struct fuse_data *data; 275 int error, revents = 0; 276 277 error = devfs_get_cdevpriv((void **)&data); 278 if (error != 0) 279 return (events & 280 (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); 281 282 if (events & (POLLIN | POLLRDNORM)) { 283 fuse_lck_mtx_lock(data->ms_mtx); 284 if (fdata_get_dead(data) || STAILQ_FIRST(&data->ms_head)) 285 revents |= events & (POLLIN | POLLRDNORM); 286 else 287 selrecord(td, &data->ks_rsel); 288 fuse_lck_mtx_unlock(data->ms_mtx); 289 } 290 if (events & (POLLOUT | POLLWRNORM)) { 291 revents |= events & (POLLOUT | POLLWRNORM); 292 } 293 return (revents); 294 } 295 296 /* 297 * fuse_device_read hangs on the queue of VFS messages. 298 * When it's notified that there is a new one, it picks that and 299 * passes up to the daemon 300 */ 301 int 302 fuse_device_read(struct cdev *dev, struct uio *uio, int ioflag) 303 { 304 int err; 305 struct fuse_data *data; 306 struct fuse_ticket *tick; 307 void *buf; 308 int buflen; 309 310 SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read"); 311 312 err = devfs_get_cdevpriv((void **)&data); 313 if (err != 0) 314 return (err); 315 316 fuse_lck_mtx_lock(data->ms_mtx); 317 again: 318 if (fdata_get_dead(data)) { 319 SDT_PROBE2(fusefs, , device, trace, 2, 320 "we know early on that reader should be kicked so we " 321 "don't wait for news"); 322 fuse_lck_mtx_unlock(data->ms_mtx); 323 return (ENODEV); 324 } 325 if (!(tick = fuse_ms_pop(data))) { 326 /* check if we may block */ 327 if (ioflag & O_NONBLOCK) { 328 /* get outa here soon */ 329 fuse_lck_mtx_unlock(data->ms_mtx); 330 return (EAGAIN); 331 } else { 332 err = msleep(data, &data->ms_mtx, PCATCH, "fu_msg", 0); 333 if (err != 0) { 334 fuse_lck_mtx_unlock(data->ms_mtx); 335 return (fdata_get_dead(data) ? ENODEV : err); 336 } 337 tick = fuse_ms_pop(data); 338 } 339 } 340 if (!tick) { 341 /* 342 * We can get here if fuse daemon suddenly terminates, 343 * eg, by being hit by a SIGKILL 344 * -- and some other cases, too, tho not totally clear, when 345 * (cv_signal/wakeup_one signals the whole process ?) 346 */ 347 SDT_PROBE2(fusefs, , device, trace, 1, "no message on thread"); 348 goto again; 349 } 350 fuse_lck_mtx_unlock(data->ms_mtx); 351 352 if (fdata_get_dead(data)) { 353 /* 354 * somebody somewhere -- eg., umount routine -- 355 * wants this liaison finished off 356 */ 357 SDT_PROBE2(fusefs, , device, trace, 2, 358 "reader is to be sacked"); 359 if (tick) { 360 SDT_PROBE2(fusefs, , device, trace, 2, "weird -- " 361 "\"kick\" is set tho there is message"); 362 FUSE_ASSERT_MS_DONE(tick); 363 fuse_ticket_drop(tick); 364 } 365 return (ENODEV); /* This should make the daemon get off 366 * of us */ 367 } 368 SDT_PROBE2(fusefs, , device, trace, 1, 369 "fuse device read message successfully"); 370 371 buf = tick->tk_ms_fiov.base; 372 buflen = tick->tk_ms_fiov.len; 373 374 /* 375 * Why not ban mercilessly stupid daemons who can't keep up 376 * with us? (There is no much use of a partial read here...) 377 */ 378 /* 379 * XXX note that in such cases Linux FUSE throws EIO at the 380 * syscall invoker and stands back to the message queue. The 381 * rationale should be made clear (and possibly adopt that 382 * behaviour). Keeping the current scheme at least makes 383 * fallacy as loud as possible... 384 */ 385 if (uio->uio_resid < buflen) { 386 fdata_set_dead(data); 387 SDT_PROBE2(fusefs, , device, trace, 2, 388 "daemon is stupid, kick it off..."); 389 err = ENODEV; 390 } else { 391 err = uiomove(buf, buflen, uio); 392 } 393 394 FUSE_ASSERT_MS_DONE(tick); 395 fuse_ticket_drop(tick); 396 397 return (err); 398 } 399 400 static inline int 401 fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio) 402 { 403 if (uio->uio_resid + sizeof(struct fuse_out_header) != ohead->len) { 404 SDT_PROBE2(fusefs, , device, trace, 1, 405 "Format error: body size " 406 "differs from size claimed by header"); 407 return (EINVAL); 408 } 409 if (uio->uio_resid && ohead->unique != 0 && ohead->error) { 410 SDT_PROBE2(fusefs, , device, trace, 1, 411 "Format error: non zero error but message had a body"); 412 return (EINVAL); 413 } 414 415 return (0); 416 } 417 418 SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_notify, 419 "struct fuse_out_header*"); 420 SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_missing_ticket, 421 "uint64_t"); 422 SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_found, 423 "struct fuse_ticket*"); 424 /* 425 * fuse_device_write first reads the header sent by the daemon. 426 * If that's OK, looks up ticket/callback node by the unique id seen in header. 427 * If the callback node contains a handler function, the uio is passed over 428 * that. 429 */ 430 static int 431 fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) 432 { 433 struct fuse_out_header ohead; 434 int err = 0; 435 struct fuse_data *data; 436 struct mount *mp; 437 struct fuse_ticket *tick, *itick, *x_tick; 438 int found = 0; 439 440 err = devfs_get_cdevpriv((void **)&data); 441 if (err != 0) 442 return (err); 443 mp = data->mp; 444 445 if (uio->uio_resid < sizeof(struct fuse_out_header)) { 446 SDT_PROBE2(fusefs, , device, trace, 1, 447 "fuse_device_write got less than a header!"); 448 fdata_set_dead(data); 449 return (EINVAL); 450 } 451 if ((err = uiomove(&ohead, sizeof(struct fuse_out_header), uio)) != 0) 452 return (err); 453 454 if (data->linux_errnos != 0 && ohead.error != 0) { 455 err = -ohead.error; 456 if (err < 0 || err >= nitems(linux_to_bsd_errtbl)) 457 return (EINVAL); 458 459 /* '-', because it will get flipped again below */ 460 ohead.error = -linux_to_bsd_errtbl[err]; 461 } 462 463 /* 464 * We check header information (which is redundant) and compare it 465 * with what we see. If we see some inconsistency we discard the 466 * whole answer and proceed on as if it had never existed. In 467 * particular, no pretender will be woken up, regardless the 468 * "unique" value in the header. 469 */ 470 if ((err = fuse_ohead_audit(&ohead, uio))) { 471 fdata_set_dead(data); 472 return (err); 473 } 474 /* Pass stuff over to callback if there is one installed */ 475 476 /* Looking for ticket with the unique id of header */ 477 fuse_lck_mtx_lock(data->aw_mtx); 478 TAILQ_FOREACH_SAFE(tick, &data->aw_head, tk_aw_link, 479 x_tick) { 480 if (tick->tk_unique == ohead.unique) { 481 SDT_PROBE1(fusefs, , device, fuse_device_write_found, 482 tick); 483 found = 1; 484 fuse_aw_remove(tick); 485 break; 486 } 487 } 488 if (found && tick->irq_unique > 0) { 489 /* 490 * Discard the FUSE_INTERRUPT ticket that tried to interrupt 491 * this operation 492 */ 493 TAILQ_FOREACH_SAFE(itick, &data->aw_head, tk_aw_link, 494 x_tick) { 495 if (itick->tk_unique == tick->irq_unique) { 496 fuse_aw_remove(itick); 497 fuse_ticket_drop(itick); 498 break; 499 } 500 } 501 tick->irq_unique = 0; 502 } 503 fuse_lck_mtx_unlock(data->aw_mtx); 504 505 if (found) { 506 if (tick->tk_aw_handler) { 507 /* 508 * We found a callback with proper handler. In this 509 * case the out header will be 0wnd by the callback, 510 * so the fun of freeing that is left for her. 511 * (Then, by all chance, she'll just get that's done 512 * via ticket_drop(), so no manual mucking 513 * around...) 514 */ 515 SDT_PROBE2(fusefs, , device, trace, 1, 516 "pass ticket to a callback"); 517 /* Sanitize the linuxism of negative errnos */ 518 ohead.error *= -1; 519 if (ohead.error < 0 || ohead.error > ELAST) { 520 /* Illegal error code */ 521 ohead.error = EIO; 522 memcpy(&tick->tk_aw_ohead, &ohead, 523 sizeof(ohead)); 524 tick->tk_aw_handler(tick, uio); 525 err = EINVAL; 526 } else { 527 memcpy(&tick->tk_aw_ohead, &ohead, 528 sizeof(ohead)); 529 err = tick->tk_aw_handler(tick, uio); 530 } 531 } else { 532 /* pretender doesn't wanna do anything with answer */ 533 SDT_PROBE2(fusefs, , device, trace, 1, 534 "stuff devalidated, so we drop it"); 535 } 536 537 /* 538 * As aw_mtx was not held during the callback execution the 539 * ticket may have been inserted again. However, this is safe 540 * because fuse_ticket_drop() will deal with refcount anyway. 541 */ 542 fuse_ticket_drop(tick); 543 } else if (ohead.unique == 0){ 544 /* unique == 0 means asynchronous notification */ 545 SDT_PROBE1(fusefs, , device, fuse_device_write_notify, &ohead); 546 switch (ohead.error) { 547 case FUSE_NOTIFY_INVAL_ENTRY: 548 err = fuse_internal_invalidate_entry(mp, uio); 549 break; 550 case FUSE_NOTIFY_INVAL_INODE: 551 err = fuse_internal_invalidate_inode(mp, uio); 552 break; 553 case FUSE_NOTIFY_RETRIEVE: 554 case FUSE_NOTIFY_STORE: 555 /* 556 * Unimplemented. I don't know of any file systems 557 * that use them, and the protocol isn't sound anyway, 558 * since the notification messages don't include the 559 * inode's generation number. Without that, it's 560 * possible to manipulate the cache of the wrong vnode. 561 * Finally, it's not defined what this message should 562 * do for a file with dirty cache. 563 */ 564 case FUSE_NOTIFY_POLL: 565 /* Unimplemented. See comments in fuse_vnops */ 566 default: 567 /* Not implemented */ 568 err = ENOSYS; 569 } 570 } else { 571 /* no callback at all! */ 572 SDT_PROBE1(fusefs, , device, fuse_device_write_missing_ticket, 573 ohead.unique); 574 if (ohead.error == -EAGAIN) { 575 /* 576 * This was probably a response to a FUSE_INTERRUPT 577 * operation whose original operation is already 578 * complete. We can't store FUSE_INTERRUPT tickets 579 * indefinitely because their responses are optional. 580 * So we delete them when the original operation 581 * completes. And sadly the fuse_header_out doesn't 582 * identify the opcode, so we have to guess. 583 */ 584 err = 0; 585 } else { 586 err = EINVAL; 587 } 588 } 589 590 return (err); 591 } 592 593 int 594 fuse_device_init(void) 595 { 596 597 fuse_dev = make_dev(&fuse_device_cdevsw, 0, UID_ROOT, GID_OPERATOR, 598 S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, "fuse"); 599 if (fuse_dev == NULL) 600 return (ENOMEM); 601 return (0); 602 } 603 604 void 605 fuse_device_destroy(void) 606 { 607 608 MPASS(fuse_dev != NULL); 609 destroy_dev(fuse_dev); 610 } 611