/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2007-2009 Google Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following disclaimer
 *   in the documentation and/or other materials provided with the
 *   distribution.
 * * Neither the name of Google Inc. nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Copyright (C) 2005 Csaba Henk.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
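
/*
 * I/O paths for the FUSE client:
 *
 *  - fuse_io_dispatch() routes a read or write request either to the
 *    "directbackend" routines, which exchange FUSE_READ/FUSE_WRITE
 *    messages with the daemon, or to the "biobackend" routines, which
 *    go through the buffer cache.
 *  - fuse_io_strategy() performs the buffer-cache I/O by reusing the
 *    direct backends.
 *  - fuse_io_flushbuf() and fuse_io_invalbuf() flush and invalidate a
 *    vnode's cached buffers.
 */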

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/sx.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/unistd.h>
#include <sys/filedesc.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

#include "fuse.h"
#include "fuse_file.h"
#include "fuse_node.h"
#include "fuse_internal.h"
#include "fuse_ipc.h"
#include "fuse_io.h"

SDT_PROVIDER_DECLARE(fuse);
/*
 * Fuse trace probe:
 * arg0: verbosity.  Higher numbers give more verbose messages
 * arg1: Textual message
 */
SDT_PROBE_DEFINE2(fuse, , io, trace, "int", "char*");

static int
fuse_read_directbackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh);
static int
fuse_read_biobackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh);
static int
fuse_write_directbackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh, int ioflag);
static int
fuse_write_biobackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh, int ioflag);

SDT_PROBE_DEFINE5(fuse, , io, io_dispatch, "struct vnode*", "struct uio*",
    "int", "struct ucred*", "struct fuse_filehandle*");
int
fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag,
    struct ucred *cred)
{
	struct fuse_filehandle *fufh;
	int err, directio;

	MPASS(vp->v_type == VREG || vp->v_type == VDIR);

	err = fuse_filehandle_getrw(vp,
	    (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh);
	if (err) {
		printf("FUSE: io dispatch: filehandles are closed\n");
		return err;
	}
	SDT_PROBE5(fuse, , io, io_dispatch, vp, uio, ioflag, cred, fufh);

	/*
	 * Ideally, when the daemon asks for direct io at open time, the
	 * standard file flag should be set according to this, so that would
	 * just change the default mode, which later on could be changed via
	 * fcntl(2).
	 * But this doesn't work, the O_DIRECT flag gets cleared at some point
	 * (don't know where). So to make any use of the Fuse direct_io option,
	 * we hardwire it into the file's private data (similarly to Linux,
	 * btw.).
	 */
	directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp));

	switch (uio->uio_rw) {
	case UIO_READ:
		if (directio) {
			SDT_PROBE2(fuse, , io, trace, 1,
			    "direct read of vnode");
			err = fuse_read_directbackend(vp, uio, cred, fufh);
		} else {
			SDT_PROBE2(fuse, , io, trace, 1,
			    "buffered read of vnode");
			err = fuse_read_biobackend(vp, uio, cred, fufh);
		}
		break;
	case UIO_WRITE:
		/*
		 * Kludge: simulate write-through caching via write-around
		 * caching.  Same effect, as far as never caching dirty data,
		 * but slightly pessimal in that newly written data is not
		 * cached.
		 */
		if (directio || fuse_data_cache_mode == FUSE_CACHE_WT) {
			SDT_PROBE2(fuse, , io, trace, 1,
			    "direct write of vnode");
			err = fuse_write_directbackend(vp, uio, cred, fufh, ioflag);
		} else {
			SDT_PROBE2(fuse, , io, trace, 1,
			    "buffered write of vnode");
			err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag);
		}
		break;
	default:
		panic("uninterpreted mode passed to fuse_io_dispatch");
	}

	return (err);
}

SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_start, "int", "int", "int");
SDT_PROBE_DEFINE2(fuse, , io, read_bio_backend_feed, "int", "int");
SDT_PROBE_DEFINE3(fuse, , io, read_bio_backend_end, "int", "ssize_t", "int");
static int
fuse_read_biobackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh)
{
	struct buf *bp;
	daddr_t lbn;
	int bcount;
	int err = 0, n = 0, on = 0;
	off_t filesize;

	const int biosize = fuse_iosize(vp);

	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);

	bcount = biosize;
	filesize = VTOFUD(vp)->filesize;

	do {
		if (fuse_isdeadfs(vp)) {
			err = ENXIO;
			break;
		}
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);

		SDT_PROBE3(fuse, , io, read_bio_backend_start,
		    biosize, (int)lbn, on);

		/*
		 * Obtain the buffer cache block.  Figure out the buffer size
		 * when we are at EOF.  If we are modifying the size of the
		 * buffer based on an EOF condition we need to hold
		 * nfs_rslock() through obtaining the buffer to prevent
		 * a potential writer-appender from messing with n_size.
		 * Otherwise we may accidentally truncate the buffer and
		 * lose dirty data.
		 *
		 * Note that bcount is *not* DEV_BSIZE aligned.
		 */
		if ((off_t)lbn * biosize >= filesize) {
			bcount = 0;
		} else if ((off_t)(lbn + 1) * biosize > filesize) {
			bcount = filesize - (off_t)lbn * biosize;
		}
		bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);

		if (!bp)
			return (EINTR);

		/*
		 * If B_CACHE is not set, we must issue the read.  If this
		 * fails, we return an error.
		 */
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			err = fuse_io_strategy(vp, bp);
			if (err) {
				brelse(bp);
				return (err);
			}
		}
		/*
		 * on is the offset into the current bp.  Figure out how many
		 * bytes we can copy out of the bp.  Note that bcount is
		 * NOT DEV_BSIZE aligned.
		 *
		 * Then figure out how many bytes we can copy into the uio.
		 */

		n = 0;
		if (on < bcount)
			n = MIN((unsigned)(bcount - on), uio->uio_resid);
		if (n > 0) {
			SDT_PROBE2(fuse, , io, read_bio_backend_feed,
			    n, n + (int)bp->b_resid);
			err = uiomove(bp->b_data + on, n, uio);
		}
		brelse(bp);
		SDT_PROBE3(fuse, , io, read_bio_backend_end, err,
		    uio->uio_resid, n);
	} while (err == 0 && uio->uio_resid > 0 && n > 0);

	return (err);
}

SDT_PROBE_DEFINE1(fuse, , io, read_directbackend_start, "struct fuse_read_in*");
SDT_PROBE_DEFINE2(fuse, , io, read_directbackend_complete,
    "struct fuse_dispatcher*", "struct uio*");

static int
fuse_read_directbackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh)
{
	struct fuse_dispatcher fdi;
	struct fuse_read_in *fri;
	int err = 0;

	if (uio->uio_resid == 0)
		return (0);

	fdisp_init(&fdi, 0);

	/*
	 * XXX In "normal" case we use an intermediate kernel buffer for
	 * transmitting data from daemon's context to ours. Eventually, we should
	 * get rid of this. Anyway, if the target uio lives in sysspace (we are
	 * called from pageops), and the input data doesn't need kernel-side
	 * processing (we are not called from readdir) we can already invoke
	 * an optimized, "peer-to-peer" I/O routine.
	 */
	while (uio->uio_resid > 0) {
		fdi.iosize = sizeof(*fri);
		fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred);
		fri = fdi.indata;
		fri->fh = fufh->fh_id;
		fri->offset = uio->uio_offset;
		fri->size = MIN(uio->uio_resid,
		    fuse_get_mpdata(vp->v_mount)->max_read);

		SDT_PROBE1(fuse, , io, read_directbackend_start, fri);

		if ((err = fdisp_wait_answ(&fdi)))
			goto out;

		SDT_PROBE2(fuse, , io, read_directbackend_complete,
		    fdi.iosize, uio);

		if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio)))
			break;
		if (fdi.iosize < fri->size)
			break;
	}

out:
	fdisp_destroy(&fdi);
	return (err);
}

static int
fuse_write_directbackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh, int ioflag)
{
	struct fuse_vnode_data *fvdat = VTOFUD(vp);
	struct fuse_write_in *fwi;
	struct fuse_dispatcher fdi;
	size_t chunksize;
	int diff;
	int err = 0;

	if (uio->uio_resid == 0)
		return (0);
	if (ioflag & IO_APPEND)
		uio_setoffset(uio, fvdat->filesize);

	fdisp_init(&fdi, 0);

	while (uio->uio_resid > 0) {
		chunksize = MIN(uio->uio_resid,
		    fuse_get_mpdata(vp->v_mount)->max_write);

		fdi.iosize = sizeof(*fwi) + chunksize;
		fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred);

		fwi = fdi.indata;
		fwi->fh = fufh->fh_id;
		fwi->offset = uio->uio_offset;
		fwi->size = chunksize;

		if ((err = uiomove((char *)fdi.indata + sizeof(*fwi),
		    chunksize, uio)))
			break;

		if ((err = fdisp_wait_answ(&fdi)))
			break;

		/* Adjust the uio in the case of short writes */
		diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size;
		if (diff < 0) {
			err = EINVAL;
			break;
		} else if (diff > 0 && !(ioflag & IO_DIRECT)) {
			/*
			 * XXX We really should be directly checking whether
			 * the file was opened with FOPEN_DIRECT_IO, not
			 * IO_DIRECT.  IO_DIRECT can be set in multiple ways.
			 */
			SDT_PROBE2(fuse, , io, trace, 1,
			    "misbehaving filesystem: short writes are only "
			    "allowed with direct_io");
		}
		uio->uio_resid += diff;
		uio->uio_offset -= diff;

		if (uio->uio_offset > fvdat->filesize &&
		    fuse_data_cache_mode != FUSE_CACHE_UC) {
			fuse_vnode_setsize(vp, uio->uio_offset);
			fvdat->flag &= ~FN_SIZECHANGE;
		}
	}

	fdisp_destroy(&fdi);

	return (err);
}

SDT_PROBE_DEFINE6(fuse, , io, write_biobackend_start, "int64_t", "int", "int",
    "struct uio*", "int", "bool");
SDT_PROBE_DEFINE2(fuse, , io, write_biobackend_append_race, "long", "int");

static int
fuse_write_biobackend(struct vnode *vp, struct uio *uio,
    struct ucred *cred, struct fuse_filehandle *fufh, int ioflag)
{
	struct fuse_vnode_data *fvdat = VTOFUD(vp);
	struct buf *bp;
	daddr_t lbn;
	int bcount;
	int n, on, err = 0;

	const int biosize = fuse_iosize(vp);

	KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode"));
	if (vp->v_type != VREG)
		return (EIO);
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	if (ioflag & IO_APPEND)
		uio_setoffset(uio, fvdat->filesize);

	/*
	 * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
	 * would exceed the local maximum per-file write commit size when
	 * combined with those, we must decide whether to flush,
	 * go synchronous, or return err.  We don't bother checking
	 * IO_UNIT -- we just make all writes atomic anyway, as there's
	 * no point optimizing for something that really won't ever happen.
	 */
	do {
		if (fuse_isdeadfs(vp)) {
			err = ENXIO;
			break;
		}
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = MIN((unsigned)(biosize - on), uio->uio_resid);

again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */
		if (uio->uio_offset == fvdat->filesize && n) {
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			SDT_PROBE6(fuse, , io, write_biobackend_start,
			    lbn, on, n, uio, bcount, true);
			bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);

			if (bp != NULL) {
				long save;

				err = fuse_vnode_setsize(vp,
				    uio->uio_offset + n);
				if (err) {
					brelse(bp);
					break;
				}
				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < fvdat->filesize) {
				if ((off_t)(lbn + 1) * biosize < fvdat->filesize)
					bcount = biosize;
				else
					bcount = fvdat->filesize -
					    (off_t)lbn * biosize;
			}
			SDT_PROBE6(fuse, , io, write_biobackend_start,
			    lbn, on, n, uio, bcount, false);
			bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);
			if (bp && uio->uio_offset + n > fvdat->filesize) {
				err = fuse_vnode_setsize(vp,
				    uio->uio_offset + n);
				if (err) {
					brelse(bp);
					break;
				}
			}
		}

		if (!bp) {
			err = EINTR;
			break;
		}
		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thus,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			fuse_io_strategy(vp, bp);
			if ((err = bp->b_error)) {
				brelse(bp);
				break;
			}
		}
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crhold(cred);

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			SDT_PROBE2(fuse, , io, write_biobackend_append_race,
			    (long)bp->b_blkno * biosize,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}
		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer (and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * As an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			/*
			 * Yes, we mean it.  Write out everything to "storage"
			 * immediately, without hesitation.  (Apart from other
			 * reasons: the only way to know if a write is valid
			 * is if it's actually written out.)
			 */
			bwrite(bp);
			if (bp->b_error == EINTR) {
				err = EINTR;
				break;
			}
			goto again;
		}
		err = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (err) {
			bp->b_ioflags |= BIO_ERROR;
			bp->b_error = err;
			brelse(bp);
			break;
		}
		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = MIN(on, bp->b_dirtyoff);
				bp->b_dirtyend = MAX((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_valid(bp, on, n);
		}
		err = bwrite(bp);
		if (err)
			break;
	} while (uio->uio_resid > 0 && n > 0);

	if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0)
		fuse_vnode_savesize(vp, cred);

	return (err);
}

int
fuse_io_strategy(struct vnode *vp, struct buf *bp)
{
	struct fuse_filehandle *fufh;
	struct fuse_vnode_data *fvdat = VTOFUD(vp);
	struct ucred *cred;
	struct uio *uiop;
	struct uio uio;
	struct iovec io;
	int error = 0;

	const int biosize = fuse_iosize(vp);

	MPASS(vp->v_type == VREG || vp->v_type == VDIR);
	MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE);

	error = fuse_filehandle_getrw(vp,
	    (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh);
	if (error) {
		printf("FUSE: strategy: filehandles are closed\n");
		bp->b_ioflags |= BIO_ERROR;
		bp->b_error = error;
		return (error);
	}
	cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred;

	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_td = curthread;

	/*
	 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;

	KASSERT(!(bp->b_flags & B_DONE),
	    ("fuse_io_strategy: bp %p already marked done", bp));
	if (bp->b_iocmd == BIO_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;

		uiop->uio_offset = ((off_t)bp->b_blkno) * biosize;
		error = fuse_read_directbackend(vp, uiop, cred, fufh);

		/* XXXCEM: Potentially invalid access to cached_attrs here */
		if ((!error && uiop->uio_resid) ||
		    (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO &&
		    uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 &&
		    uiop->uio_offset >= fvdat->cached_attrs.va_size)) {
			/*
			 * If we had a short read with no error, we must have
			 * hit a file hole.  We should zero-fill the remainder.
			 * This can also occur if the server hits the file EOF.
			 *
			 * Holes used to be able to occur due to pending
			 * writes, but that is not possible any longer.
			 */
			int nread = bp->b_bcount - uiop->uio_resid;
			int left = uiop->uio_resid;

			if (error != 0) {
				printf("FUSE: Fix broken io: offset %ju, "
				    " resid %zd, file size %ju/%ju\n",
				    (uintmax_t)uiop->uio_offset,
				    uiop->uio_resid, fvdat->filesize,
				    fvdat->cached_attrs.va_size);
				error = 0;
			}
			if (left > 0)
				bzero((char *)bp->b_data + nread, left);
			uiop->uio_resid = 0;
		}
		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			bp->b_error = error;
		}
	} else {
		/*
		 * If we only need to commit, try to commit
		 */
		if (bp->b_flags & B_NEEDCOMMIT) {
			SDT_PROBE2(fuse, , io, trace, 1,
			    "write: B_NEEDCOMMIT flags set");
		}
		/*
		 * Setup for actual write
		 */
		if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend >
		    fvdat->filesize)
			bp->b_dirtyend = fvdat->filesize -
			    (off_t)bp->b_blkno * biosize;

		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend
			    - bp->b_dirtyoff;
			uiop->uio_offset = (off_t)bp->b_blkno * biosize
			    + bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;

			error = fuse_write_directbackend(vp, uiop, cred, fufh, 0);

			if (error == EINTR || error == ETIMEDOUT
			    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {

				bp->b_flags &= ~(B_INVAL | B_NOCACHE);
				if ((bp->b_flags & B_PAGING) == 0) {
					bdirty(bp);
					bp->b_flags &= ~B_DONE;
				}
				if ((error == EINTR || error == ETIMEDOUT) &&
				    (bp->b_flags & B_ASYNC) == 0)
					bp->b_flags |= B_EINTR;
			} else {
				if (error) {
					bp->b_ioflags |= BIO_ERROR;
					bp->b_flags |= B_INVAL;
					bp->b_error = error;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
			}
		} else {
			bp->b_resid = 0;
			bufdone(bp);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	bufdone(bp);
	return (error);
}

int
fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td)
{

	return (vn_fsync_buf(vp, waitfor));
}

/*
 * Flush and invalidate all dirty buffers.  If another process is already
 * doing the flush, just wait for completion.
 */
int
fuse_io_invalbuf(struct vnode *vp, struct thread *td)
{
	struct fuse_vnode_data *fvdat = VTOFUD(vp);
	int error = 0;

	if (vp->v_iflag & VI_DOOMED)
		return 0;

	ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf");

	while (fvdat->flag & FN_FLUSHINPROG) {
		struct proc *p = td->td_proc;

		if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF)
			return EIO;
		fvdat->flag |= FN_FLUSHWANT;
		tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz);
		error = 0;
		if (p != NULL) {
			PROC_LOCK(p);
			if (SIGNOTEMPTY(p->p_siglist) ||
			    SIGNOTEMPTY(td->td_siglist))
				error = EINTR;
			PROC_UNLOCK(p);
		}
		if (error == EINTR)
			return EINTR;
	}
	fvdat->flag |= FN_FLUSHINPROG;

	if (vp->v_bufobj.bo_object != NULL) {
		VM_OBJECT_WLOCK(vp->v_bufobj.bo_object);
		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
		VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object);
	}
	error = vinvalbuf(vp, V_SAVE, PCATCH, 0);
	while (error) {
		if (error == ERESTART || error == EINTR) {
			fvdat->flag &= ~FN_FLUSHINPROG;
			if (fvdat->flag & FN_FLUSHWANT) {
				fvdat->flag &= ~FN_FLUSHWANT;
				wakeup(&fvdat->flag);
			}
			return EINTR;
		}
		error = vinvalbuf(vp, V_SAVE, PCATCH, 0);
	}
	fvdat->flag &= ~FN_FLUSHINPROG;
	if (fvdat->flag & FN_FLUSHWANT) {
		fvdat->flag &= ~FN_FLUSHWANT;
		wakeup(&fvdat->flag);
	}
	return (error);
}