1 /* 2 * Copyright (c) 2007-2009 Google Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are 7 * met: 8 * 9 * * Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * * Redistributions in binary form must reproduce the above 12 * copyright notice, this list of conditions and the following disclaimer 13 * in the documentation and/or other materials provided with the 14 * distribution. 15 * * Neither the name of Google Inc. nor the names of its 16 * contributors may be used to endorse or promote products derived from 17 * this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 * Copyright (C) 2005 Csaba Henk. 32 * All rights reserved. 33 * 34 * Redistribution and use in source and binary forms, with or without 35 * modification, are permitted provided that the following conditions 36 * are met: 37 * 1. Redistributions of source code must retain the above copyright 38 * notice, this list of conditions and the following disclaimer. 39 * 2. Redistributions in binary form must reproduce the above copyright 40 * notice, this list of conditions and the following disclaimer in the 41 * documentation and/or other materials provided with the distribution. 42 * 43 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND 44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 46 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 53 * SUCH DAMAGE. 54 */ 55 56 #include <sys/cdefs.h> 57 __FBSDID("$FreeBSD$"); 58 59 #include <sys/types.h> 60 #include <sys/module.h> 61 #include <sys/systm.h> 62 #include <sys/errno.h> 63 #include <sys/param.h> 64 #include <sys/kernel.h> 65 #include <sys/conf.h> 66 #include <sys/uio.h> 67 #include <sys/malloc.h> 68 #include <sys/queue.h> 69 #include <sys/lock.h> 70 #include <sys/sx.h> 71 #include <sys/mutex.h> 72 #include <sys/rwlock.h> 73 #include <sys/proc.h> 74 #include <sys/mount.h> 75 #include <sys/vnode.h> 76 #include <sys/stat.h> 77 #include <sys/unistd.h> 78 #include <sys/filedesc.h> 79 #include <sys/file.h> 80 #include <sys/fcntl.h> 81 #include <sys/bio.h> 82 #include <sys/buf.h> 83 #include <sys/sysctl.h> 84 85 #include <vm/vm.h> 86 #include <vm/vm_extern.h> 87 #include <vm/pmap.h> 88 #include <vm/vm_map.h> 89 #include <vm/vm_page.h> 90 #include <vm/vm_object.h> 91 92 #include "fuse.h" 93 #include "fuse_file.h" 94 #include "fuse_node.h" 95 #include "fuse_internal.h" 96 #include "fuse_ipc.h" 97 #include "fuse_io.h" 98 99 #define FUSE_DEBUG_MODULE IO 100 #include "fuse_debug.h" 101 102 103 static int 104 fuse_read_directbackend(struct vnode *vp, struct uio *uio, 105 struct ucred *cred, struct fuse_filehandle *fufh); 106 static int 107 fuse_read_biobackend(struct vnode *vp, struct uio *uio, 108 struct ucred *cred, struct fuse_filehandle *fufh); 109 static int 110 fuse_write_directbackend(struct vnode *vp, struct uio *uio, 111 struct ucred *cred, struct fuse_filehandle *fufh); 112 static int 113 fuse_write_biobackend(struct vnode *vp, struct uio *uio, 114 struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); 115 116 int 117 fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, 118 struct ucred *cred) 119 { 120 struct fuse_filehandle *fufh; 121 int err, directio; 122 123 MPASS(vp->v_type == VREG || vp->v_type == VDIR); 124 125 err = fuse_filehandle_getrw(vp, 126 (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); 127 if (err) { 128 printf("FUSE: io dispatch: filehandles are closed\n"); 129 return err; 130 } 131 /* 132 * Ideally, when the daemon asks for direct io at open time, the 133 * standard file flag should be set according to this, so that would 134 * just change the default mode, which later on could be changed via 135 * fcntl(2). 136 * But this doesn't work, the O_DIRECT flag gets cleared at some point 137 * (don't know where). So to make any use of the Fuse direct_io option, 138 * we hardwire it into the file's private data (similarly to Linux, 139 * btw.). 140 */ 141 directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); 142 143 switch (uio->uio_rw) { 144 case UIO_READ: 145 if (directio) { 146 FS_DEBUG("direct read of vnode %ju via file handle %ju\n", 147 (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id); 148 err = fuse_read_directbackend(vp, uio, cred, fufh); 149 } else { 150 FS_DEBUG("buffered read of vnode %ju\n", 151 (uintmax_t)VTOILLU(vp)); 152 err = fuse_read_biobackend(vp, uio, cred, fufh); 153 } 154 break; 155 case UIO_WRITE: 156 if (directio) { 157 FS_DEBUG("direct write of vnode %ju via file handle %ju\n", 158 (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id); 159 err = fuse_write_directbackend(vp, uio, cred, fufh); 160 } else { 161 FS_DEBUG("buffered write of vnode %ju\n", 162 (uintmax_t)VTOILLU(vp)); 163 err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag); 164 } 165 break; 166 default: 167 panic("uninterpreted mode passed to fuse_io_dispatch"); 168 } 169 170 return (err); 171 } 172 173 static int 174 fuse_read_biobackend(struct vnode *vp, struct uio *uio, 175 struct ucred *cred, struct fuse_filehandle *fufh) 176 { 177 struct buf *bp; 178 daddr_t lbn; 179 int bcount; 180 int err = 0, n = 0, on = 0; 181 off_t filesize; 182 183 const int biosize = fuse_iosize(vp); 184 185 FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n", 186 uio->uio_resid, uio->uio_offset, VTOFUD(vp)->filesize); 187 188 if (uio->uio_resid == 0) 189 return (0); 190 if (uio->uio_offset < 0) 191 return (EINVAL); 192 193 bcount = MIN(MAXBSIZE, biosize); 194 filesize = VTOFUD(vp)->filesize; 195 196 do { 197 if (fuse_isdeadfs(vp)) { 198 err = ENXIO; 199 break; 200 } 201 lbn = uio->uio_offset / biosize; 202 on = uio->uio_offset & (biosize - 1); 203 204 FS_DEBUG2G("biosize %d, lbn %d, on %d\n", biosize, (int)lbn, on); 205 206 /* 207 * Obtain the buffer cache block. Figure out the buffer size 208 * when we are at EOF. If we are modifying the size of the 209 * buffer based on an EOF condition we need to hold 210 * nfs_rslock() through obtaining the buffer to prevent 211 * a potential writer-appender from messing with n_size. 212 * Otherwise we may accidentally truncate the buffer and 213 * lose dirty data. 214 * 215 * Note that bcount is *not* DEV_BSIZE aligned. 216 */ 217 if ((off_t)lbn * biosize >= filesize) { 218 bcount = 0; 219 } else if ((off_t)(lbn + 1) * biosize > filesize) { 220 bcount = filesize - (off_t)lbn *biosize; 221 } 222 bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); 223 224 if (!bp) 225 return (EINTR); 226 227 /* 228 * If B_CACHE is not set, we must issue the read. If this 229 * fails, we return an error. 230 */ 231 232 if ((bp->b_flags & B_CACHE) == 0) { 233 bp->b_iocmd = BIO_READ; 234 vfs_busy_pages(bp, 0); 235 err = fuse_io_strategy(vp, bp); 236 if (err) { 237 brelse(bp); 238 return (err); 239 } 240 } 241 /* 242 * on is the offset into the current bp. Figure out how many 243 * bytes we can copy out of the bp. Note that bcount is 244 * NOT DEV_BSIZE aligned. 245 * 246 * Then figure out how many bytes we can copy into the uio. 247 */ 248 249 n = 0; 250 if (on < bcount) 251 n = MIN((unsigned)(bcount - on), uio->uio_resid); 252 if (n > 0) { 253 FS_DEBUG2G("feeding buffeater with %d bytes of buffer %p," 254 " saying %d was asked for\n", 255 n, bp->b_data + on, n + (int)bp->b_resid); 256 err = uiomove(bp->b_data + on, n, uio); 257 } 258 brelse(bp); 259 FS_DEBUG2G("end of turn, err %d, uio->uio_resid %zd, n %d\n", 260 err, uio->uio_resid, n); 261 } while (err == 0 && uio->uio_resid > 0 && n > 0); 262 263 return (err); 264 } 265 266 static int 267 fuse_read_directbackend(struct vnode *vp, struct uio *uio, 268 struct ucred *cred, struct fuse_filehandle *fufh) 269 { 270 struct fuse_dispatcher fdi; 271 struct fuse_read_in *fri; 272 int err = 0; 273 274 if (uio->uio_resid == 0) 275 return (0); 276 277 fdisp_init(&fdi, 0); 278 279 /* 280 * XXX In "normal" case we use an intermediate kernel buffer for 281 * transmitting data from daemon's context to ours. Eventually, we should 282 * get rid of this. Anyway, if the target uio lives in sysspace (we are 283 * called from pageops), and the input data doesn't need kernel-side 284 * processing (we are not called from readdir) we can already invoke 285 * an optimized, "peer-to-peer" I/O routine. 286 */ 287 while (uio->uio_resid > 0) { 288 fdi.iosize = sizeof(*fri); 289 fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred); 290 fri = fdi.indata; 291 fri->fh = fufh->fh_id; 292 fri->offset = uio->uio_offset; 293 fri->size = MIN(uio->uio_resid, 294 fuse_get_mpdata(vp->v_mount)->max_read); 295 296 FS_DEBUG2G("fri->fh %ju, fri->offset %ju, fri->size %ju\n", 297 (uintmax_t)fri->fh, (uintmax_t)fri->offset, 298 (uintmax_t)fri->size); 299 300 if ((err = fdisp_wait_answ(&fdi))) 301 goto out; 302 303 FS_DEBUG2G("complete: got iosize=%d, requested fri.size=%zd; " 304 "resid=%zd offset=%ju\n", 305 fri->size, fdi.iosize, uio->uio_resid, 306 (uintmax_t)uio->uio_offset); 307 308 if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) 309 break; 310 if (fdi.iosize < fri->size) 311 break; 312 } 313 314 out: 315 fdisp_destroy(&fdi); 316 return (err); 317 } 318 319 static int 320 fuse_write_directbackend(struct vnode *vp, struct uio *uio, 321 struct ucred *cred, struct fuse_filehandle *fufh) 322 { 323 struct fuse_vnode_data *fvdat = VTOFUD(vp); 324 struct fuse_write_in *fwi; 325 struct fuse_dispatcher fdi; 326 size_t chunksize; 327 int diff; 328 int err = 0; 329 330 if (!uio->uio_resid) 331 return (0); 332 333 fdisp_init(&fdi, 0); 334 335 while (uio->uio_resid > 0) { 336 chunksize = MIN(uio->uio_resid, 337 fuse_get_mpdata(vp->v_mount)->max_write); 338 339 fdi.iosize = sizeof(*fwi) + chunksize; 340 fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); 341 342 fwi = fdi.indata; 343 fwi->fh = fufh->fh_id; 344 fwi->offset = uio->uio_offset; 345 fwi->size = chunksize; 346 347 if ((err = uiomove((char *)fdi.indata + sizeof(*fwi), 348 chunksize, uio))) 349 break; 350 351 if ((err = fdisp_wait_answ(&fdi))) 352 break; 353 354 diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size; 355 if (diff < 0) { 356 err = EINVAL; 357 break; 358 } 359 uio->uio_resid += diff; 360 uio->uio_offset -= diff; 361 if (uio->uio_offset > fvdat->filesize) 362 fuse_vnode_setsize(vp, cred, uio->uio_offset); 363 } 364 365 fdisp_destroy(&fdi); 366 367 return (err); 368 } 369 370 static int 371 fuse_write_biobackend(struct vnode *vp, struct uio *uio, 372 struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) 373 { 374 struct fuse_vnode_data *fvdat = VTOFUD(vp); 375 struct buf *bp; 376 daddr_t lbn; 377 int bcount; 378 int n, on, err = 0; 379 380 const int biosize = fuse_iosize(vp); 381 382 KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); 383 FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n", 384 uio->uio_resid, uio->uio_offset, fvdat->filesize); 385 if (vp->v_type != VREG) 386 return (EIO); 387 if (uio->uio_offset < 0) 388 return (EINVAL); 389 if (uio->uio_resid == 0) 390 return (0); 391 if (ioflag & IO_APPEND) 392 uio_setoffset(uio, fvdat->filesize); 393 394 /* 395 * Find all of this file's B_NEEDCOMMIT buffers. If our writes 396 * would exceed the local maximum per-file write commit size when 397 * combined with those, we must decide whether to flush, 398 * go synchronous, or return err. We don't bother checking 399 * IO_UNIT -- we just make all writes atomic anyway, as there's 400 * no point optimizing for something that really won't ever happen. 401 */ 402 do { 403 if (fuse_isdeadfs(vp)) { 404 err = ENXIO; 405 break; 406 } 407 lbn = uio->uio_offset / biosize; 408 on = uio->uio_offset & (biosize - 1); 409 n = MIN((unsigned)(biosize - on), uio->uio_resid); 410 411 FS_DEBUG2G("lbn %ju, on %d, n %d, uio offset %ju, uio resid %zd\n", 412 (uintmax_t)lbn, on, n, 413 (uintmax_t)uio->uio_offset, uio->uio_resid); 414 415 again: 416 /* 417 * Handle direct append and file extension cases, calculate 418 * unaligned buffer size. 419 */ 420 if (uio->uio_offset == fvdat->filesize && n) { 421 /* 422 * Get the buffer (in its pre-append state to maintain 423 * B_CACHE if it was previously set). Resize the 424 * nfsnode after we have locked the buffer to prevent 425 * readers from reading garbage. 426 */ 427 bcount = on; 428 FS_DEBUG("getting block from OS, bcount %d\n", bcount); 429 bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); 430 431 if (bp != NULL) { 432 long save; 433 434 err = fuse_vnode_setsize(vp, cred, 435 uio->uio_offset + n); 436 if (err) { 437 brelse(bp); 438 break; 439 } 440 save = bp->b_flags & B_CACHE; 441 bcount += n; 442 allocbuf(bp, bcount); 443 bp->b_flags |= save; 444 } 445 } else { 446 /* 447 * Obtain the locked cache block first, and then 448 * adjust the file's size as appropriate. 449 */ 450 bcount = on + n; 451 if ((off_t)lbn * biosize + bcount < fvdat->filesize) { 452 if ((off_t)(lbn + 1) * biosize < fvdat->filesize) 453 bcount = biosize; 454 else 455 bcount = fvdat->filesize - 456 (off_t)lbn *biosize; 457 } 458 FS_DEBUG("getting block from OS, bcount %d\n", bcount); 459 bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); 460 if (bp && uio->uio_offset + n > fvdat->filesize) { 461 err = fuse_vnode_setsize(vp, cred, 462 uio->uio_offset + n); 463 if (err) { 464 brelse(bp); 465 break; 466 } 467 } 468 } 469 470 if (!bp) { 471 err = EINTR; 472 break; 473 } 474 /* 475 * Issue a READ if B_CACHE is not set. In special-append 476 * mode, B_CACHE is based on the buffer prior to the write 477 * op and is typically set, avoiding the read. If a read 478 * is required in special append mode, the server will 479 * probably send us a short-read since we extended the file 480 * on our end, resulting in b_resid == 0 and, thusly, 481 * B_CACHE getting set. 482 * 483 * We can also avoid issuing the read if the write covers 484 * the entire buffer. We have to make sure the buffer state 485 * is reasonable in this case since we will not be initiating 486 * I/O. See the comments in kern/vfs_bio.c's getblk() for 487 * more information. 488 * 489 * B_CACHE may also be set due to the buffer being cached 490 * normally. 491 */ 492 493 if (on == 0 && n == bcount) { 494 bp->b_flags |= B_CACHE; 495 bp->b_flags &= ~B_INVAL; 496 bp->b_ioflags &= ~BIO_ERROR; 497 } 498 if ((bp->b_flags & B_CACHE) == 0) { 499 bp->b_iocmd = BIO_READ; 500 vfs_busy_pages(bp, 0); 501 fuse_io_strategy(vp, bp); 502 if ((err = bp->b_error)) { 503 brelse(bp); 504 break; 505 } 506 } 507 if (bp->b_wcred == NOCRED) 508 bp->b_wcred = crhold(cred); 509 510 /* 511 * If dirtyend exceeds file size, chop it down. This should 512 * not normally occur but there is an append race where it 513 * might occur XXX, so we log it. 514 * 515 * If the chopping creates a reverse-indexed or degenerate 516 * situation with dirtyoff/end, we 0 both of them. 517 */ 518 519 if (bp->b_dirtyend > bcount) { 520 FS_DEBUG("FUSE append race @%lx:%d\n", 521 (long)bp->b_blkno * biosize, 522 bp->b_dirtyend - bcount); 523 bp->b_dirtyend = bcount; 524 } 525 if (bp->b_dirtyoff >= bp->b_dirtyend) 526 bp->b_dirtyoff = bp->b_dirtyend = 0; 527 528 /* 529 * If the new write will leave a contiguous dirty 530 * area, just update the b_dirtyoff and b_dirtyend, 531 * otherwise force a write rpc of the old dirty area. 532 * 533 * While it is possible to merge discontiguous writes due to 534 * our having a B_CACHE buffer ( and thus valid read data 535 * for the hole), we don't because it could lead to 536 * significant cache coherency problems with multiple clients, 537 * especially if locking is implemented later on. 538 * 539 * as an optimization we could theoretically maintain 540 * a linked list of discontinuous areas, but we would still 541 * have to commit them separately so there isn't much 542 * advantage to it except perhaps a bit of asynchronization. 543 */ 544 545 if (bp->b_dirtyend > 0 && 546 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { 547 /* 548 * Yes, we mean it. Write out everything to "storage" 549 * immediately, without hesitation. (Apart from other 550 * reasons: the only way to know if a write is valid 551 * if its actually written out.) 552 */ 553 bwrite(bp); 554 if (bp->b_error == EINTR) { 555 err = EINTR; 556 break; 557 } 558 goto again; 559 } 560 err = uiomove((char *)bp->b_data + on, n, uio); 561 562 /* 563 * Since this block is being modified, it must be written 564 * again and not just committed. Since write clustering does 565 * not work for the stage 1 data write, only the stage 2 566 * commit rpc, we have to clear B_CLUSTEROK as well. 567 */ 568 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 569 570 if (err) { 571 bp->b_ioflags |= BIO_ERROR; 572 bp->b_error = err; 573 brelse(bp); 574 break; 575 } 576 /* 577 * Only update dirtyoff/dirtyend if not a degenerate 578 * condition. 579 */ 580 if (n) { 581 if (bp->b_dirtyend > 0) { 582 bp->b_dirtyoff = MIN(on, bp->b_dirtyoff); 583 bp->b_dirtyend = MAX((on + n), bp->b_dirtyend); 584 } else { 585 bp->b_dirtyoff = on; 586 bp->b_dirtyend = on + n; 587 } 588 vfs_bio_set_valid(bp, on, n); 589 } 590 err = bwrite(bp); 591 if (err) 592 break; 593 } while (uio->uio_resid > 0 && n > 0); 594 595 if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0) 596 fuse_vnode_savesize(vp, cred); 597 598 return (err); 599 } 600 601 int 602 fuse_io_strategy(struct vnode *vp, struct buf *bp) 603 { 604 struct fuse_filehandle *fufh; 605 struct fuse_vnode_data *fvdat = VTOFUD(vp); 606 struct ucred *cred; 607 struct uio *uiop; 608 struct uio uio; 609 struct iovec io; 610 int error = 0; 611 612 const int biosize = fuse_iosize(vp); 613 614 MPASS(vp->v_type == VREG || vp->v_type == VDIR); 615 MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); 616 FS_DEBUG("inode=%ju offset=%jd resid=%ld\n", 617 (uintmax_t)VTOI(vp), (intmax_t)(((off_t)bp->b_blkno) * biosize), 618 bp->b_bcount); 619 620 error = fuse_filehandle_getrw(vp, 621 (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); 622 if (error) { 623 printf("FUSE: strategy: filehandles are closed\n"); 624 bp->b_ioflags |= BIO_ERROR; 625 bp->b_error = error; 626 return (error); 627 } 628 cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; 629 630 uiop = &uio; 631 uiop->uio_iov = &io; 632 uiop->uio_iovcnt = 1; 633 uiop->uio_segflg = UIO_SYSSPACE; 634 uiop->uio_td = curthread; 635 636 /* 637 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We 638 * do this here so we do not have to do it in all the code that 639 * calls us. 640 */ 641 bp->b_flags &= ~B_INVAL; 642 bp->b_ioflags &= ~BIO_ERROR; 643 644 KASSERT(!(bp->b_flags & B_DONE), 645 ("fuse_io_strategy: bp %p already marked done", bp)); 646 if (bp->b_iocmd == BIO_READ) { 647 io.iov_len = uiop->uio_resid = bp->b_bcount; 648 io.iov_base = bp->b_data; 649 uiop->uio_rw = UIO_READ; 650 651 uiop->uio_offset = ((off_t)bp->b_blkno) * biosize; 652 error = fuse_read_directbackend(vp, uiop, cred, fufh); 653 654 if ((!error && uiop->uio_resid) || 655 (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO && 656 uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 && 657 uiop->uio_offset >= fvdat->cached_attrs.va_size)) { 658 /* 659 * If we had a short read with no error, we must have 660 * hit a file hole. We should zero-fill the remainder. 661 * This can also occur if the server hits the file EOF. 662 * 663 * Holes used to be able to occur due to pending 664 * writes, but that is not possible any longer. 665 */ 666 int nread = bp->b_bcount - uiop->uio_resid; 667 int left = uiop->uio_resid; 668 669 if (error != 0) { 670 printf("FUSE: Fix broken io: offset %ju, " 671 " resid %zd, file size %ju/%ju\n", 672 (uintmax_t)uiop->uio_offset, 673 uiop->uio_resid, fvdat->filesize, 674 fvdat->cached_attrs.va_size); 675 error = 0; 676 } 677 if (left > 0) 678 bzero((char *)bp->b_data + nread, left); 679 uiop->uio_resid = 0; 680 } 681 if (error) { 682 bp->b_ioflags |= BIO_ERROR; 683 bp->b_error = error; 684 } 685 } else { 686 /* 687 * If we only need to commit, try to commit 688 */ 689 if (bp->b_flags & B_NEEDCOMMIT) { 690 FS_DEBUG("write: B_NEEDCOMMIT flags set\n"); 691 } 692 /* 693 * Setup for actual write 694 */ 695 if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > 696 fvdat->filesize) 697 bp->b_dirtyend = fvdat->filesize - 698 (off_t)bp->b_blkno * biosize; 699 700 if (bp->b_dirtyend > bp->b_dirtyoff) { 701 io.iov_len = uiop->uio_resid = bp->b_dirtyend 702 - bp->b_dirtyoff; 703 uiop->uio_offset = (off_t)bp->b_blkno * biosize 704 + bp->b_dirtyoff; 705 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 706 uiop->uio_rw = UIO_WRITE; 707 708 error = fuse_write_directbackend(vp, uiop, cred, fufh); 709 710 if (error == EINTR || error == ETIMEDOUT 711 || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 712 713 bp->b_flags &= ~(B_INVAL | B_NOCACHE); 714 if ((bp->b_flags & B_PAGING) == 0) { 715 bdirty(bp); 716 bp->b_flags &= ~B_DONE; 717 } 718 if ((error == EINTR || error == ETIMEDOUT) && 719 (bp->b_flags & B_ASYNC) == 0) 720 bp->b_flags |= B_EINTR; 721 } else { 722 if (error) { 723 bp->b_ioflags |= BIO_ERROR; 724 bp->b_flags |= B_INVAL; 725 bp->b_error = error; 726 } 727 bp->b_dirtyoff = bp->b_dirtyend = 0; 728 } 729 } else { 730 bp->b_resid = 0; 731 bufdone(bp); 732 return (0); 733 } 734 } 735 bp->b_resid = uiop->uio_resid; 736 bufdone(bp); 737 return (error); 738 } 739 740 int 741 fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td) 742 { 743 struct vop_fsync_args a = { 744 .a_vp = vp, 745 .a_waitfor = waitfor, 746 .a_td = td, 747 }; 748 749 return (vop_stdfsync(&a)); 750 } 751 752 /* 753 * Flush and invalidate all dirty buffers. If another process is already 754 * doing the flush, just wait for completion. 755 */ 756 int 757 fuse_io_invalbuf(struct vnode *vp, struct thread *td) 758 { 759 struct fuse_vnode_data *fvdat = VTOFUD(vp); 760 int error = 0; 761 762 if (vp->v_iflag & VI_DOOMED) 763 return 0; 764 765 ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf"); 766 767 while (fvdat->flag & FN_FLUSHINPROG) { 768 struct proc *p = td->td_proc; 769 770 if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) 771 return EIO; 772 fvdat->flag |= FN_FLUSHWANT; 773 tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz); 774 error = 0; 775 if (p != NULL) { 776 PROC_LOCK(p); 777 if (SIGNOTEMPTY(p->p_siglist) || 778 SIGNOTEMPTY(td->td_siglist)) 779 error = EINTR; 780 PROC_UNLOCK(p); 781 } 782 if (error == EINTR) 783 return EINTR; 784 } 785 fvdat->flag |= FN_FLUSHINPROG; 786 787 if (vp->v_bufobj.bo_object != NULL) { 788 VM_OBJECT_WLOCK(vp->v_bufobj.bo_object); 789 vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); 790 VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object); 791 } 792 error = vinvalbuf(vp, V_SAVE, PCATCH, 0); 793 while (error) { 794 if (error == ERESTART || error == EINTR) { 795 fvdat->flag &= ~FN_FLUSHINPROG; 796 if (fvdat->flag & FN_FLUSHWANT) { 797 fvdat->flag &= ~FN_FLUSHWANT; 798 wakeup(&fvdat->flag); 799 } 800 return EINTR; 801 } 802 error = vinvalbuf(vp, V_SAVE, PCATCH, 0); 803 } 804 fvdat->flag &= ~FN_FLUSHINPROG; 805 if (fvdat->flag & FN_FLUSHWANT) { 806 fvdat->flag &= ~FN_FLUSHWANT; 807 wakeup(&fvdat->flag); 808 } 809 return (error); 810 } 811