1 /* 2 * Copyright (c) 2007-2009 Google Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are 7 * met: 8 * 9 * * Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * * Redistributions in binary form must reproduce the above 12 * copyright notice, this list of conditions and the following disclaimer 13 * in the documentation and/or other materials provided with the 14 * distribution. 15 * * Neither the name of Google Inc. nor the names of its 16 * contributors may be used to endorse or promote products derived from 17 * this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 * Copyright (C) 2005 Csaba Henk. 32 * All rights reserved. 33 * 34 * Redistribution and use in source and binary forms, with or without 35 * modification, are permitted provided that the following conditions 36 * are met: 37 * 1. Redistributions of source code must retain the above copyright 38 * notice, this list of conditions and the following disclaimer. 39 * 2. Redistributions in binary form must reproduce the above copyright 40 * notice, this list of conditions and the following disclaimer in the 41 * documentation and/or other materials provided with the distribution. 42 * 43 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND 44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 46 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 53 * SUCH DAMAGE. 54 */ 55 56 #include <sys/cdefs.h> 57 __FBSDID("$FreeBSD$"); 58 59 #include <sys/types.h> 60 #include <sys/module.h> 61 #include <sys/systm.h> 62 #include <sys/errno.h> 63 #include <sys/param.h> 64 #include <sys/kernel.h> 65 #include <sys/conf.h> 66 #include <sys/uio.h> 67 #include <sys/malloc.h> 68 #include <sys/queue.h> 69 #include <sys/lock.h> 70 #include <sys/sx.h> 71 #include <sys/mutex.h> 72 #include <sys/proc.h> 73 #include <sys/mount.h> 74 #include <sys/vnode.h> 75 #include <sys/stat.h> 76 #include <sys/unistd.h> 77 #include <sys/filedesc.h> 78 #include <sys/file.h> 79 #include <sys/fcntl.h> 80 #include <sys/bio.h> 81 #include <sys/buf.h> 82 #include <sys/sysctl.h> 83 84 #include <vm/vm.h> 85 #include <vm/vm_extern.h> 86 #include <vm/pmap.h> 87 #include <vm/vm_map.h> 88 #include <vm/vm_page.h> 89 #include <vm/vm_object.h> 90 #include <vm/vm_pager.h> 91 #include <vm/vnode_pager.h> 92 #include <vm/vm_object.h> 93 94 #include "fuse.h" 95 #include "fuse_file.h" 96 #include "fuse_node.h" 97 #include "fuse_internal.h" 98 #include "fuse_ipc.h" 99 #include "fuse_io.h" 100 101 #define FUSE_DEBUG_MODULE IO 102 #include "fuse_debug.h" 103 104 105 static int 106 fuse_read_directbackend(struct vnode *vp, struct uio *uio, 107 struct ucred *cred, struct fuse_filehandle *fufh); 108 static int 109 fuse_read_biobackend(struct vnode *vp, struct uio *uio, 110 struct ucred *cred, struct fuse_filehandle *fufh); 111 static int 112 fuse_write_directbackend(struct vnode *vp, struct uio *uio, 113 struct ucred *cred, struct fuse_filehandle *fufh); 114 static int 115 fuse_write_biobackend(struct vnode *vp, struct uio *uio, 116 struct ucred *cred, struct fuse_filehandle *fufh); 117 118 int 119 fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, 120 struct ucred *cred) 121 { 122 struct fuse_filehandle *fufh; 123 int err, directio; 124 125 MPASS(vp->v_type == VREG); 126 127 err = fuse_filehandle_getrw(vp, 128 (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); 129 if (err) { 130 printf("FUSE: io dispatch: filehandles are closed\n"); 131 return err; 132 } 133 /* 134 * Ideally, when the daemon asks for direct io at open time, the 135 * standard file flag should be set according to this, so that would 136 * just change the default mode, which later on could be changed via 137 * fcntl(2). 138 * But this doesn't work, the O_DIRECT flag gets cleared at some point 139 * (don't know where). So to make any use of the Fuse direct_io option, 140 * we hardwire it into the file's private data (similarly to Linux, 141 * btw.). 142 */ 143 directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); 144 145 switch (uio->uio_rw) { 146 case UIO_READ: 147 if (directio) { 148 FS_DEBUG("direct read of vnode %ju via file handle %ju\n", 149 (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id); 150 err = fuse_read_directbackend(vp, uio, cred, fufh); 151 } else { 152 FS_DEBUG("buffered read of vnode %ju\n", 153 (uintmax_t)VTOILLU(vp)); 154 err = fuse_read_biobackend(vp, uio, cred, fufh); 155 } 156 break; 157 case UIO_WRITE: 158 if (directio) { 159 FS_DEBUG("direct write of vnode %ju via file handle %ju\n", 160 (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id); 161 err = fuse_write_directbackend(vp, uio, cred, fufh); 162 fuse_invalidate_attr(vp); 163 } else { 164 FS_DEBUG("buffered write of vnode %ju\n", 165 (uintmax_t)VTOILLU(vp)); 166 err = fuse_write_biobackend(vp, uio, cred, fufh); 167 } 168 break; 169 default: 170 panic("uninterpreted mode passed to fuse_io_dispatch"); 171 } 172 173 return (err); 174 } 175 176 static int 177 fuse_read_biobackend(struct vnode *vp, struct uio *uio, 178 struct ucred *cred, struct fuse_filehandle *fufh) 179 { 180 struct buf *bp; 181 daddr_t lbn; 182 int bcount; 183 int err = 0, n = 0, on = 0; 184 off_t filesize; 185 186 const int biosize = fuse_iosize(vp); 187 188 FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n", 189 uio->uio_resid, uio->uio_offset, VTOFUD(vp)->filesize); 190 191 if (uio->uio_resid == 0) 192 return (0); 193 if (uio->uio_offset < 0) 194 return (EINVAL); 195 196 bcount = MIN(MAXBSIZE, biosize); 197 filesize = VTOFUD(vp)->filesize; 198 199 do { 200 if (fuse_isdeadfs(vp)) { 201 err = ENXIO; 202 break; 203 } 204 lbn = uio->uio_offset / biosize; 205 on = uio->uio_offset & (biosize - 1); 206 207 FS_DEBUG2G("biosize %d, lbn %d, on %d\n", biosize, (int)lbn, on); 208 209 /* 210 * Obtain the buffer cache block. Figure out the buffer size 211 * when we are at EOF. If we are modifying the size of the 212 * buffer based on an EOF condition we need to hold 213 * nfs_rslock() through obtaining the buffer to prevent 214 * a potential writer-appender from messing with n_size. 215 * Otherwise we may accidently truncate the buffer and 216 * lose dirty data. 217 * 218 * Note that bcount is *not* DEV_BSIZE aligned. 219 */ 220 if ((off_t)lbn * biosize >= filesize) { 221 bcount = 0; 222 } else if ((off_t)(lbn + 1) * biosize > filesize) { 223 bcount = filesize - (off_t)lbn *biosize; 224 } 225 bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); 226 227 if (!bp) 228 return (EINTR); 229 230 /* 231 * If B_CACHE is not set, we must issue the read. If this 232 * fails, we return an error. 233 */ 234 235 if ((bp->b_flags & B_CACHE) == 0) { 236 bp->b_iocmd = BIO_READ; 237 vfs_busy_pages(bp, 0); 238 err = fuse_io_strategy(vp, bp); 239 if (err) { 240 brelse(bp); 241 return (err); 242 } 243 } 244 /* 245 * on is the offset into the current bp. Figure out how many 246 * bytes we can copy out of the bp. Note that bcount is 247 * NOT DEV_BSIZE aligned. 248 * 249 * Then figure out how many bytes we can copy into the uio. 250 */ 251 252 n = 0; 253 if (on < bcount) 254 n = MIN((unsigned)(bcount - on), uio->uio_resid); 255 if (n > 0) { 256 FS_DEBUG2G("feeding buffeater with %d bytes of buffer %p," 257 " saying %d was asked for\n", 258 n, bp->b_data + on, n + (int)bp->b_resid); 259 err = uiomove(bp->b_data + on, n, uio); 260 } 261 brelse(bp); 262 FS_DEBUG2G("end of turn, err %d, uio->uio_resid %zd, n %d\n", 263 err, uio->uio_resid, n); 264 } while (err == 0 && uio->uio_resid > 0 && n > 0); 265 266 return (err); 267 } 268 269 static int 270 fuse_read_directbackend(struct vnode *vp, struct uio *uio, 271 struct ucred *cred, struct fuse_filehandle *fufh) 272 { 273 struct fuse_dispatcher fdi; 274 struct fuse_read_in *fri; 275 int err = 0; 276 277 if (uio->uio_resid == 0) 278 return (0); 279 280 fdisp_init(&fdi, 0); 281 282 /* 283 * XXX In "normal" case we use an intermediate kernel buffer for 284 * transmitting data from daemon's context to ours. Eventually, we should 285 * get rid of this. Anyway, if the target uio lives in sysspace (we are 286 * called from pageops), and the input data doesn't need kernel-side 287 * processing (we are not called from readdir) we can already invoke 288 * an optimized, "peer-to-peer" I/O routine. 289 */ 290 while (uio->uio_resid > 0) { 291 fdi.iosize = sizeof(*fri); 292 fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred); 293 fri = fdi.indata; 294 fri->fh = fufh->fh_id; 295 fri->offset = uio->uio_offset; 296 fri->size = MIN(uio->uio_resid, 297 fuse_get_mpdata(vp->v_mount)->max_read); 298 299 FS_DEBUG2G("fri->fh %ju, fri->offset %ju, fri->size %ju\n", 300 (uintmax_t)fri->fh, (uintmax_t)fri->offset, 301 (uintmax_t)fri->size); 302 303 if ((err = fdisp_wait_answ(&fdi))) 304 goto out; 305 306 FS_DEBUG2G("complete: got iosize=%d, requested fri.size=%zd; " 307 "resid=%zd offset=%ju\n", 308 fri->size, fdi.iosize, uio->uio_resid, 309 (uintmax_t)uio->uio_offset); 310 311 if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) 312 break; 313 if (fdi.iosize < fri->size) 314 break; 315 } 316 317 out: 318 fdisp_destroy(&fdi); 319 return (err); 320 } 321 322 static int 323 fuse_write_directbackend(struct vnode *vp, struct uio *uio, 324 struct ucred *cred, struct fuse_filehandle *fufh) 325 { 326 struct fuse_vnode_data *fvdat = VTOFUD(vp); 327 struct fuse_write_in *fwi; 328 struct fuse_dispatcher fdi; 329 size_t chunksize; 330 int diff; 331 int err = 0; 332 333 if (!uio->uio_resid) 334 return (0); 335 336 fdisp_init(&fdi, 0); 337 338 while (uio->uio_resid > 0) { 339 chunksize = MIN(uio->uio_resid, 340 fuse_get_mpdata(vp->v_mount)->max_write); 341 342 fdi.iosize = sizeof(*fwi) + chunksize; 343 fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); 344 345 fwi = fdi.indata; 346 fwi->fh = fufh->fh_id; 347 fwi->offset = uio->uio_offset; 348 fwi->size = chunksize; 349 350 if ((err = uiomove((char *)fdi.indata + sizeof(*fwi), 351 chunksize, uio))) 352 break; 353 354 if ((err = fdisp_wait_answ(&fdi))) 355 break; 356 357 diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size; 358 if (diff < 0) { 359 err = EINVAL; 360 break; 361 } 362 uio->uio_resid += diff; 363 uio->uio_offset -= diff; 364 if (uio->uio_offset > fvdat->filesize) 365 fuse_vnode_setsize(vp, cred, uio->uio_offset); 366 } 367 368 fdisp_destroy(&fdi); 369 370 return (err); 371 } 372 373 static int 374 fuse_write_biobackend(struct vnode *vp, struct uio *uio, 375 struct ucred *cred, struct fuse_filehandle *fufh) 376 { 377 struct fuse_vnode_data *fvdat = VTOFUD(vp); 378 struct buf *bp; 379 daddr_t lbn; 380 int bcount; 381 int n, on, err = 0; 382 383 const int biosize = fuse_iosize(vp); 384 385 KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); 386 FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n", 387 uio->uio_resid, uio->uio_offset, fvdat->filesize); 388 if (vp->v_type != VREG) 389 return (EIO); 390 if (uio->uio_offset < 0) 391 return (EINVAL); 392 if (uio->uio_resid == 0) 393 return (0); 394 395 /* 396 * Find all of this file's B_NEEDCOMMIT buffers. If our writes 397 * would exceed the local maximum per-file write commit size when 398 * combined with those, we must decide whether to flush, 399 * go synchronous, or return err. We don't bother checking 400 * IO_UNIT -- we just make all writes atomic anyway, as there's 401 * no point optimizing for something that really won't ever happen. 402 */ 403 do { 404 if (fuse_isdeadfs(vp)) { 405 err = ENXIO; 406 break; 407 } 408 lbn = uio->uio_offset / biosize; 409 on = uio->uio_offset & (biosize - 1); 410 n = MIN((unsigned)(biosize - on), uio->uio_resid); 411 412 FS_DEBUG2G("lbn %ju, on %d, n %d, uio offset %ju, uio resid %zd\n", 413 (uintmax_t)lbn, on, n, 414 (uintmax_t)uio->uio_offset, uio->uio_resid); 415 416 again: 417 /* 418 * Handle direct append and file extension cases, calculate 419 * unaligned buffer size. 420 */ 421 if (uio->uio_offset == fvdat->filesize && n) { 422 /* 423 * Get the buffer (in its pre-append state to maintain 424 * B_CACHE if it was previously set). Resize the 425 * nfsnode after we have locked the buffer to prevent 426 * readers from reading garbage. 427 */ 428 bcount = on; 429 FS_DEBUG("getting block from OS, bcount %d\n", bcount); 430 bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); 431 432 if (bp != NULL) { 433 long save; 434 435 err = fuse_vnode_setsize(vp, cred, 436 uio->uio_offset + n); 437 if (err) { 438 brelse(bp); 439 break; 440 } 441 save = bp->b_flags & B_CACHE; 442 bcount += n; 443 allocbuf(bp, bcount); 444 bp->b_flags |= save; 445 } 446 } else { 447 /* 448 * Obtain the locked cache block first, and then 449 * adjust the file's size as appropriate. 450 */ 451 bcount = on + n; 452 if ((off_t)lbn * biosize + bcount < fvdat->filesize) { 453 if ((off_t)(lbn + 1) * biosize < fvdat->filesize) 454 bcount = biosize; 455 else 456 bcount = fvdat->filesize - 457 (off_t)lbn *biosize; 458 } 459 FS_DEBUG("getting block from OS, bcount %d\n", bcount); 460 bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); 461 if (bp && uio->uio_offset + n > fvdat->filesize) { 462 err = fuse_vnode_setsize(vp, cred, 463 uio->uio_offset + n); 464 if (err) { 465 brelse(bp); 466 break; 467 } 468 } 469 } 470 471 if (!bp) { 472 err = EINTR; 473 break; 474 } 475 /* 476 * Issue a READ if B_CACHE is not set. In special-append 477 * mode, B_CACHE is based on the buffer prior to the write 478 * op and is typically set, avoiding the read. If a read 479 * is required in special append mode, the server will 480 * probably send us a short-read since we extended the file 481 * on our end, resulting in b_resid == 0 and, thusly, 482 * B_CACHE getting set. 483 * 484 * We can also avoid issuing the read if the write covers 485 * the entire buffer. We have to make sure the buffer state 486 * is reasonable in this case since we will not be initiating 487 * I/O. See the comments in kern/vfs_bio.c's getblk() for 488 * more information. 489 * 490 * B_CACHE may also be set due to the buffer being cached 491 * normally. 492 */ 493 494 if (on == 0 && n == bcount) { 495 bp->b_flags |= B_CACHE; 496 bp->b_flags &= ~B_INVAL; 497 bp->b_ioflags &= ~BIO_ERROR; 498 } 499 if ((bp->b_flags & B_CACHE) == 0) { 500 bp->b_iocmd = BIO_READ; 501 vfs_busy_pages(bp, 0); 502 fuse_io_strategy(vp, bp); 503 if ((err = bp->b_error)) { 504 brelse(bp); 505 break; 506 } 507 } 508 if (bp->b_wcred == NOCRED) 509 bp->b_wcred = crhold(cred); 510 511 /* 512 * If dirtyend exceeds file size, chop it down. This should 513 * not normally occur but there is an append race where it 514 * might occur XXX, so we log it. 515 * 516 * If the chopping creates a reverse-indexed or degenerate 517 * situation with dirtyoff/end, we 0 both of them. 518 */ 519 520 if (bp->b_dirtyend > bcount) { 521 FS_DEBUG("FUSE append race @%lx:%d\n", 522 (long)bp->b_blkno * biosize, 523 bp->b_dirtyend - bcount); 524 bp->b_dirtyend = bcount; 525 } 526 if (bp->b_dirtyoff >= bp->b_dirtyend) 527 bp->b_dirtyoff = bp->b_dirtyend = 0; 528 529 /* 530 * If the new write will leave a contiguous dirty 531 * area, just update the b_dirtyoff and b_dirtyend, 532 * otherwise force a write rpc of the old dirty area. 533 * 534 * While it is possible to merge discontiguous writes due to 535 * our having a B_CACHE buffer ( and thus valid read data 536 * for the hole), we don't because it could lead to 537 * significant cache coherency problems with multiple clients, 538 * especially if locking is implemented later on. 539 * 540 * as an optimization we could theoretically maintain 541 * a linked list of discontinuous areas, but we would still 542 * have to commit them separately so there isn't much 543 * advantage to it except perhaps a bit of asynchronization. 544 */ 545 546 if (bp->b_dirtyend > 0 && 547 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { 548 /* 549 * Yes, we mean it. Write out everything to "storage" 550 * immediatly, without hesitation. (Apart from other 551 * reasons: the only way to know if a write is valid 552 * if its actually written out.) 553 */ 554 bwrite(bp); 555 if (bp->b_error == EINTR) { 556 err = EINTR; 557 break; 558 } 559 goto again; 560 } 561 err = uiomove((char *)bp->b_data + on, n, uio); 562 563 /* 564 * Since this block is being modified, it must be written 565 * again and not just committed. Since write clustering does 566 * not work for the stage 1 data write, only the stage 2 567 * commit rpc, we have to clear B_CLUSTEROK as well. 568 */ 569 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 570 571 if (err) { 572 bp->b_ioflags |= BIO_ERROR; 573 bp->b_error = err; 574 brelse(bp); 575 break; 576 } 577 /* 578 * Only update dirtyoff/dirtyend if not a degenerate 579 * condition. 580 */ 581 if (n) { 582 if (bp->b_dirtyend > 0) { 583 bp->b_dirtyoff = MIN(on, bp->b_dirtyoff); 584 bp->b_dirtyend = MAX((on + n), bp->b_dirtyend); 585 } else { 586 bp->b_dirtyoff = on; 587 bp->b_dirtyend = on + n; 588 } 589 vfs_bio_set_valid(bp, on, n); 590 } 591 err = bwrite(bp); 592 if (err) 593 break; 594 } while (uio->uio_resid > 0 && n > 0); 595 596 if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0) 597 fuse_vnode_savesize(vp, cred); 598 599 return (err); 600 } 601 602 int 603 fuse_io_strategy(struct vnode *vp, struct buf *bp) 604 { 605 struct fuse_filehandle *fufh; 606 struct fuse_vnode_data *fvdat = VTOFUD(vp); 607 struct ucred *cred; 608 struct uio *uiop; 609 struct uio uio; 610 struct iovec io; 611 int error = 0; 612 613 const int biosize = fuse_iosize(vp); 614 615 MPASS(vp->v_type == VREG); 616 MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); 617 FS_DEBUG("inode=%ju offset=%jd resid=%ld\n", 618 (uintmax_t)VTOI(vp), (intmax_t)(((off_t)bp->b_blkno) * biosize), 619 bp->b_bcount); 620 621 error = fuse_filehandle_getrw(vp, 622 (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); 623 if (error) { 624 printf("FUSE: strategy: filehandles are closed\n"); 625 bp->b_ioflags |= BIO_ERROR; 626 bp->b_error = error; 627 return (error); 628 } 629 cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; 630 631 uiop = &uio; 632 uiop->uio_iov = &io; 633 uiop->uio_iovcnt = 1; 634 uiop->uio_segflg = UIO_SYSSPACE; 635 uiop->uio_td = curthread; 636 637 /* 638 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We 639 * do this here so we do not have to do it in all the code that 640 * calls us. 641 */ 642 bp->b_flags &= ~B_INVAL; 643 bp->b_ioflags &= ~BIO_ERROR; 644 645 KASSERT(!(bp->b_flags & B_DONE), 646 ("fuse_io_strategy: bp %p already marked done", bp)); 647 if (bp->b_iocmd == BIO_READ) { 648 io.iov_len = uiop->uio_resid = bp->b_bcount; 649 io.iov_base = bp->b_data; 650 uiop->uio_rw = UIO_READ; 651 652 uiop->uio_offset = ((off_t)bp->b_blkno) * biosize; 653 error = fuse_read_directbackend(vp, uiop, cred, fufh); 654 655 if ((!error && uiop->uio_resid) || 656 (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO && 657 uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 && 658 uiop->uio_offset >= fvdat->cached_attrs.va_size)) { 659 /* 660 * If we had a short read with no error, we must have 661 * hit a file hole. We should zero-fill the remainder. 662 * This can also occur if the server hits the file EOF. 663 * 664 * Holes used to be able to occur due to pending 665 * writes, but that is not possible any longer. 666 */ 667 int nread = bp->b_bcount - uiop->uio_resid; 668 int left = uiop->uio_resid; 669 670 if (error != 0) { 671 printf("FUSE: Fix broken io: offset %ju, " 672 " resid %zd, file size %ju/%ju\n", 673 (uintmax_t)uiop->uio_offset, 674 uiop->uio_resid, fvdat->filesize, 675 fvdat->cached_attrs.va_size); 676 error = 0; 677 } 678 if (left > 0) 679 bzero((char *)bp->b_data + nread, left); 680 uiop->uio_resid = 0; 681 } 682 if (error) { 683 bp->b_ioflags |= BIO_ERROR; 684 bp->b_error = error; 685 } 686 } else { 687 /* 688 * If we only need to commit, try to commit 689 */ 690 if (bp->b_flags & B_NEEDCOMMIT) { 691 FS_DEBUG("write: B_NEEDCOMMIT flags set\n"); 692 } 693 /* 694 * Setup for actual write 695 */ 696 if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > 697 fvdat->filesize) 698 bp->b_dirtyend = fvdat->filesize - 699 (off_t)bp->b_blkno * biosize; 700 701 if (bp->b_dirtyend > bp->b_dirtyoff) { 702 io.iov_len = uiop->uio_resid = bp->b_dirtyend 703 - bp->b_dirtyoff; 704 uiop->uio_offset = (off_t)bp->b_blkno * biosize 705 + bp->b_dirtyoff; 706 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 707 uiop->uio_rw = UIO_WRITE; 708 709 error = fuse_write_directbackend(vp, uiop, cred, fufh); 710 711 if (error == EINTR || error == ETIMEDOUT 712 || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 713 714 bp->b_flags &= ~(B_INVAL | B_NOCACHE); 715 if ((bp->b_flags & B_PAGING) == 0) { 716 bdirty(bp); 717 bp->b_flags &= ~B_DONE; 718 } 719 if ((error == EINTR || error == ETIMEDOUT) && 720 (bp->b_flags & B_ASYNC) == 0) 721 bp->b_flags |= B_EINTR; 722 } else { 723 if (error) { 724 bp->b_ioflags |= BIO_ERROR; 725 bp->b_flags |= B_INVAL; 726 bp->b_error = error; 727 } 728 bp->b_dirtyoff = bp->b_dirtyend = 0; 729 } 730 } else { 731 bp->b_resid = 0; 732 bufdone(bp); 733 return (0); 734 } 735 } 736 bp->b_resid = uiop->uio_resid; 737 bufdone(bp); 738 return (error); 739 } 740 741 int 742 fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td) 743 { 744 struct vop_fsync_args a = { 745 .a_vp = vp, 746 .a_waitfor = waitfor, 747 .a_td = td, 748 }; 749 750 return (vop_stdfsync(&a)); 751 } 752 753 /* 754 * Flush and invalidate all dirty buffers. If another process is already 755 * doing the flush, just wait for completion. 756 */ 757 int 758 fuse_io_invalbuf(struct vnode *vp, struct thread *td) 759 { 760 struct fuse_vnode_data *fvdat = VTOFUD(vp); 761 int error = 0; 762 763 if (vp->v_iflag & VI_DOOMED) 764 return 0; 765 766 ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf"); 767 768 while (fvdat->flag & FN_FLUSHINPROG) { 769 struct proc *p = td->td_proc; 770 771 if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) 772 return EIO; 773 fvdat->flag |= FN_FLUSHWANT; 774 tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz); 775 error = 0; 776 if (p != NULL) { 777 PROC_LOCK(p); 778 if (SIGNOTEMPTY(p->p_siglist) || 779 SIGNOTEMPTY(td->td_siglist)) 780 error = EINTR; 781 PROC_UNLOCK(p); 782 } 783 if (error == EINTR) 784 return EINTR; 785 } 786 fvdat->flag |= FN_FLUSHINPROG; 787 788 if (vp->v_bufobj.bo_object != NULL) { 789 VM_OBJECT_LOCK(vp->v_bufobj.bo_object); 790 vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); 791 VM_OBJECT_UNLOCK(vp->v_bufobj.bo_object); 792 } 793 error = vinvalbuf(vp, V_SAVE, PCATCH, 0); 794 while (error) { 795 if (error == ERESTART || error == EINTR) { 796 fvdat->flag &= ~FN_FLUSHINPROG; 797 if (fvdat->flag & FN_FLUSHWANT) { 798 fvdat->flag &= ~FN_FLUSHWANT; 799 wakeup(&fvdat->flag); 800 } 801 return EINTR; 802 } 803 error = vinvalbuf(vp, V_SAVE, PCATCH, 0); 804 } 805 fvdat->flag &= ~FN_FLUSHINPROG; 806 if (fvdat->flag & FN_FLUSHWANT) { 807 fvdat->flag &= ~FN_FLUSHWANT; 808 wakeup(&fvdat->flag); 809 } 810 return (error); 811 } 812