1 /* 2 * Copyright (c) 2007-2009 Google Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are 7 * met: 8 * 9 * * Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * * Redistributions in binary form must reproduce the above 12 * copyright notice, this list of conditions and the following disclaimer 13 * in the documentation and/or other materials provided with the 14 * distribution. 15 * * Neither the name of Google Inc. nor the names of its 16 * contributors may be used to endorse or promote products derived from 17 * this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 * Copyright (C) 2005 Csaba Henk. 32 * All rights reserved. 33 * 34 * Redistribution and use in source and binary forms, with or without 35 * modification, are permitted provided that the following conditions 36 * are met: 37 * 1. Redistributions of source code must retain the above copyright 38 * notice, this list of conditions and the following disclaimer. 39 * 2. Redistributions in binary form must reproduce the above copyright 40 * notice, this list of conditions and the following disclaimer in the 41 * documentation and/or other materials provided with the distribution. 42 * 43 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND 44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 46 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 53 * SUCH DAMAGE. 54 */ 55 56 #include <sys/cdefs.h> 57 __FBSDID("$FreeBSD$"); 58 59 #include <sys/types.h> 60 #include <sys/module.h> 61 #include <sys/systm.h> 62 #include <sys/errno.h> 63 #include <sys/param.h> 64 #include <sys/kernel.h> 65 #include <sys/conf.h> 66 #include <sys/uio.h> 67 #include <sys/malloc.h> 68 #include <sys/queue.h> 69 #include <sys/lock.h> 70 #include <sys/sx.h> 71 #include <sys/mutex.h> 72 #include <sys/proc.h> 73 #include <sys/mount.h> 74 #include <sys/vnode.h> 75 #include <sys/stat.h> 76 #include <sys/unistd.h> 77 #include <sys/filedesc.h> 78 #include <sys/file.h> 79 #include <sys/fcntl.h> 80 #include <sys/bio.h> 81 #include <sys/buf.h> 82 #include <sys/sysctl.h> 83 84 #include <vm/vm.h> 85 #include <vm/vm_extern.h> 86 #include <vm/pmap.h> 87 #include <vm/vm_map.h> 88 #include <vm/vm_page.h> 89 #include <vm/vm_object.h> 90 91 #include "fuse.h" 92 #include "fuse_file.h" 93 #include "fuse_node.h" 94 #include "fuse_internal.h" 95 #include "fuse_ipc.h" 96 #include "fuse_io.h" 97 98 #define FUSE_DEBUG_MODULE IO 99 #include "fuse_debug.h" 100 101 102 static int 103 fuse_read_directbackend(struct vnode *vp, struct uio *uio, 104 struct ucred *cred, struct fuse_filehandle *fufh); 105 static int 106 fuse_read_biobackend(struct vnode *vp, struct uio *uio, 107 struct ucred *cred, struct fuse_filehandle *fufh); 108 static int 109 fuse_write_directbackend(struct vnode *vp, struct uio *uio, 110 struct ucred *cred, struct fuse_filehandle *fufh); 111 static int 112 fuse_write_biobackend(struct vnode *vp, struct uio *uio, 113 struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); 114 115 int 116 fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, 117 struct ucred *cred) 118 { 119 struct fuse_filehandle *fufh; 120 int err, directio; 121 122 MPASS(vp->v_type == VREG || vp->v_type == VDIR); 123 124 err = fuse_filehandle_getrw(vp, 125 (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); 126 if (err) { 127 printf("FUSE: io dispatch: filehandles are closed\n"); 128 return err; 129 } 130 /* 131 * Ideally, when the daemon asks for direct io at open time, the 132 * standard file flag should be set according to this, so that would 133 * just change the default mode, which later on could be changed via 134 * fcntl(2). 135 * But this doesn't work, the O_DIRECT flag gets cleared at some point 136 * (don't know where). So to make any use of the Fuse direct_io option, 137 * we hardwire it into the file's private data (similarly to Linux, 138 * btw.). 139 */ 140 directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); 141 142 switch (uio->uio_rw) { 143 case UIO_READ: 144 if (directio) { 145 FS_DEBUG("direct read of vnode %ju via file handle %ju\n", 146 (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id); 147 err = fuse_read_directbackend(vp, uio, cred, fufh); 148 } else { 149 FS_DEBUG("buffered read of vnode %ju\n", 150 (uintmax_t)VTOILLU(vp)); 151 err = fuse_read_biobackend(vp, uio, cred, fufh); 152 } 153 break; 154 case UIO_WRITE: 155 if (directio) { 156 FS_DEBUG("direct write of vnode %ju via file handle %ju\n", 157 (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id); 158 err = fuse_write_directbackend(vp, uio, cred, fufh); 159 } else { 160 FS_DEBUG("buffered write of vnode %ju\n", 161 (uintmax_t)VTOILLU(vp)); 162 err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag); 163 } 164 break; 165 default: 166 panic("uninterpreted mode passed to fuse_io_dispatch"); 167 } 168 169 return (err); 170 } 171 172 static int 173 fuse_read_biobackend(struct vnode *vp, struct uio *uio, 174 struct ucred *cred, struct fuse_filehandle *fufh) 175 { 176 struct buf *bp; 177 daddr_t lbn; 178 int bcount; 179 int err = 0, n = 0, on = 0; 180 off_t filesize; 181 182 const int biosize = fuse_iosize(vp); 183 184 FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n", 185 uio->uio_resid, uio->uio_offset, VTOFUD(vp)->filesize); 186 187 if (uio->uio_resid == 0) 188 return (0); 189 if (uio->uio_offset < 0) 190 return (EINVAL); 191 192 bcount = MIN(MAXBSIZE, biosize); 193 filesize = VTOFUD(vp)->filesize; 194 195 do { 196 if (fuse_isdeadfs(vp)) { 197 err = ENXIO; 198 break; 199 } 200 lbn = uio->uio_offset / biosize; 201 on = uio->uio_offset & (biosize - 1); 202 203 FS_DEBUG2G("biosize %d, lbn %d, on %d\n", biosize, (int)lbn, on); 204 205 /* 206 * Obtain the buffer cache block. Figure out the buffer size 207 * when we are at EOF. If we are modifying the size of the 208 * buffer based on an EOF condition we need to hold 209 * nfs_rslock() through obtaining the buffer to prevent 210 * a potential writer-appender from messing with n_size. 211 * Otherwise we may accidently truncate the buffer and 212 * lose dirty data. 213 * 214 * Note that bcount is *not* DEV_BSIZE aligned. 215 */ 216 if ((off_t)lbn * biosize >= filesize) { 217 bcount = 0; 218 } else if ((off_t)(lbn + 1) * biosize > filesize) { 219 bcount = filesize - (off_t)lbn *biosize; 220 } 221 bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); 222 223 if (!bp) 224 return (EINTR); 225 226 /* 227 * If B_CACHE is not set, we must issue the read. If this 228 * fails, we return an error. 229 */ 230 231 if ((bp->b_flags & B_CACHE) == 0) { 232 bp->b_iocmd = BIO_READ; 233 vfs_busy_pages(bp, 0); 234 err = fuse_io_strategy(vp, bp); 235 if (err) { 236 brelse(bp); 237 return (err); 238 } 239 } 240 /* 241 * on is the offset into the current bp. Figure out how many 242 * bytes we can copy out of the bp. Note that bcount is 243 * NOT DEV_BSIZE aligned. 244 * 245 * Then figure out how many bytes we can copy into the uio. 246 */ 247 248 n = 0; 249 if (on < bcount) 250 n = MIN((unsigned)(bcount - on), uio->uio_resid); 251 if (n > 0) { 252 FS_DEBUG2G("feeding buffeater with %d bytes of buffer %p," 253 " saying %d was asked for\n", 254 n, bp->b_data + on, n + (int)bp->b_resid); 255 err = uiomove(bp->b_data + on, n, uio); 256 } 257 brelse(bp); 258 FS_DEBUG2G("end of turn, err %d, uio->uio_resid %zd, n %d\n", 259 err, uio->uio_resid, n); 260 } while (err == 0 && uio->uio_resid > 0 && n > 0); 261 262 return (err); 263 } 264 265 static int 266 fuse_read_directbackend(struct vnode *vp, struct uio *uio, 267 struct ucred *cred, struct fuse_filehandle *fufh) 268 { 269 struct fuse_dispatcher fdi; 270 struct fuse_read_in *fri; 271 int err = 0; 272 273 if (uio->uio_resid == 0) 274 return (0); 275 276 fdisp_init(&fdi, 0); 277 278 /* 279 * XXX In "normal" case we use an intermediate kernel buffer for 280 * transmitting data from daemon's context to ours. Eventually, we should 281 * get rid of this. Anyway, if the target uio lives in sysspace (we are 282 * called from pageops), and the input data doesn't need kernel-side 283 * processing (we are not called from readdir) we can already invoke 284 * an optimized, "peer-to-peer" I/O routine. 285 */ 286 while (uio->uio_resid > 0) { 287 fdi.iosize = sizeof(*fri); 288 fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred); 289 fri = fdi.indata; 290 fri->fh = fufh->fh_id; 291 fri->offset = uio->uio_offset; 292 fri->size = MIN(uio->uio_resid, 293 fuse_get_mpdata(vp->v_mount)->max_read); 294 295 FS_DEBUG2G("fri->fh %ju, fri->offset %ju, fri->size %ju\n", 296 (uintmax_t)fri->fh, (uintmax_t)fri->offset, 297 (uintmax_t)fri->size); 298 299 if ((err = fdisp_wait_answ(&fdi))) 300 goto out; 301 302 FS_DEBUG2G("complete: got iosize=%d, requested fri.size=%zd; " 303 "resid=%zd offset=%ju\n", 304 fri->size, fdi.iosize, uio->uio_resid, 305 (uintmax_t)uio->uio_offset); 306 307 if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) 308 break; 309 if (fdi.iosize < fri->size) 310 break; 311 } 312 313 out: 314 fdisp_destroy(&fdi); 315 return (err); 316 } 317 318 static int 319 fuse_write_directbackend(struct vnode *vp, struct uio *uio, 320 struct ucred *cred, struct fuse_filehandle *fufh) 321 { 322 struct fuse_vnode_data *fvdat = VTOFUD(vp); 323 struct fuse_write_in *fwi; 324 struct fuse_dispatcher fdi; 325 size_t chunksize; 326 int diff; 327 int err = 0; 328 329 if (!uio->uio_resid) 330 return (0); 331 332 fdisp_init(&fdi, 0); 333 334 while (uio->uio_resid > 0) { 335 chunksize = MIN(uio->uio_resid, 336 fuse_get_mpdata(vp->v_mount)->max_write); 337 338 fdi.iosize = sizeof(*fwi) + chunksize; 339 fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); 340 341 fwi = fdi.indata; 342 fwi->fh = fufh->fh_id; 343 fwi->offset = uio->uio_offset; 344 fwi->size = chunksize; 345 346 if ((err = uiomove((char *)fdi.indata + sizeof(*fwi), 347 chunksize, uio))) 348 break; 349 350 if ((err = fdisp_wait_answ(&fdi))) 351 break; 352 353 diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size; 354 if (diff < 0) { 355 err = EINVAL; 356 break; 357 } 358 uio->uio_resid += diff; 359 uio->uio_offset -= diff; 360 if (uio->uio_offset > fvdat->filesize) 361 fuse_vnode_setsize(vp, cred, uio->uio_offset); 362 } 363 364 fdisp_destroy(&fdi); 365 366 return (err); 367 } 368 369 static int 370 fuse_write_biobackend(struct vnode *vp, struct uio *uio, 371 struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) 372 { 373 struct fuse_vnode_data *fvdat = VTOFUD(vp); 374 struct buf *bp; 375 daddr_t lbn; 376 int bcount; 377 int n, on, err = 0; 378 379 const int biosize = fuse_iosize(vp); 380 381 KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); 382 FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n", 383 uio->uio_resid, uio->uio_offset, fvdat->filesize); 384 if (vp->v_type != VREG) 385 return (EIO); 386 if (uio->uio_offset < 0) 387 return (EINVAL); 388 if (uio->uio_resid == 0) 389 return (0); 390 if (ioflag & IO_APPEND) 391 uio_setoffset(uio, fvdat->filesize); 392 393 /* 394 * Find all of this file's B_NEEDCOMMIT buffers. If our writes 395 * would exceed the local maximum per-file write commit size when 396 * combined with those, we must decide whether to flush, 397 * go synchronous, or return err. We don't bother checking 398 * IO_UNIT -- we just make all writes atomic anyway, as there's 399 * no point optimizing for something that really won't ever happen. 400 */ 401 do { 402 if (fuse_isdeadfs(vp)) { 403 err = ENXIO; 404 break; 405 } 406 lbn = uio->uio_offset / biosize; 407 on = uio->uio_offset & (biosize - 1); 408 n = MIN((unsigned)(biosize - on), uio->uio_resid); 409 410 FS_DEBUG2G("lbn %ju, on %d, n %d, uio offset %ju, uio resid %zd\n", 411 (uintmax_t)lbn, on, n, 412 (uintmax_t)uio->uio_offset, uio->uio_resid); 413 414 again: 415 /* 416 * Handle direct append and file extension cases, calculate 417 * unaligned buffer size. 418 */ 419 if (uio->uio_offset == fvdat->filesize && n) { 420 /* 421 * Get the buffer (in its pre-append state to maintain 422 * B_CACHE if it was previously set). Resize the 423 * nfsnode after we have locked the buffer to prevent 424 * readers from reading garbage. 425 */ 426 bcount = on; 427 FS_DEBUG("getting block from OS, bcount %d\n", bcount); 428 bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); 429 430 if (bp != NULL) { 431 long save; 432 433 err = fuse_vnode_setsize(vp, cred, 434 uio->uio_offset + n); 435 if (err) { 436 brelse(bp); 437 break; 438 } 439 save = bp->b_flags & B_CACHE; 440 bcount += n; 441 allocbuf(bp, bcount); 442 bp->b_flags |= save; 443 } 444 } else { 445 /* 446 * Obtain the locked cache block first, and then 447 * adjust the file's size as appropriate. 448 */ 449 bcount = on + n; 450 if ((off_t)lbn * biosize + bcount < fvdat->filesize) { 451 if ((off_t)(lbn + 1) * biosize < fvdat->filesize) 452 bcount = biosize; 453 else 454 bcount = fvdat->filesize - 455 (off_t)lbn *biosize; 456 } 457 FS_DEBUG("getting block from OS, bcount %d\n", bcount); 458 bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); 459 if (bp && uio->uio_offset + n > fvdat->filesize) { 460 err = fuse_vnode_setsize(vp, cred, 461 uio->uio_offset + n); 462 if (err) { 463 brelse(bp); 464 break; 465 } 466 } 467 } 468 469 if (!bp) { 470 err = EINTR; 471 break; 472 } 473 /* 474 * Issue a READ if B_CACHE is not set. In special-append 475 * mode, B_CACHE is based on the buffer prior to the write 476 * op and is typically set, avoiding the read. If a read 477 * is required in special append mode, the server will 478 * probably send us a short-read since we extended the file 479 * on our end, resulting in b_resid == 0 and, thusly, 480 * B_CACHE getting set. 481 * 482 * We can also avoid issuing the read if the write covers 483 * the entire buffer. We have to make sure the buffer state 484 * is reasonable in this case since we will not be initiating 485 * I/O. See the comments in kern/vfs_bio.c's getblk() for 486 * more information. 487 * 488 * B_CACHE may also be set due to the buffer being cached 489 * normally. 490 */ 491 492 if (on == 0 && n == bcount) { 493 bp->b_flags |= B_CACHE; 494 bp->b_flags &= ~B_INVAL; 495 bp->b_ioflags &= ~BIO_ERROR; 496 } 497 if ((bp->b_flags & B_CACHE) == 0) { 498 bp->b_iocmd = BIO_READ; 499 vfs_busy_pages(bp, 0); 500 fuse_io_strategy(vp, bp); 501 if ((err = bp->b_error)) { 502 brelse(bp); 503 break; 504 } 505 } 506 if (bp->b_wcred == NOCRED) 507 bp->b_wcred = crhold(cred); 508 509 /* 510 * If dirtyend exceeds file size, chop it down. This should 511 * not normally occur but there is an append race where it 512 * might occur XXX, so we log it. 513 * 514 * If the chopping creates a reverse-indexed or degenerate 515 * situation with dirtyoff/end, we 0 both of them. 516 */ 517 518 if (bp->b_dirtyend > bcount) { 519 FS_DEBUG("FUSE append race @%lx:%d\n", 520 (long)bp->b_blkno * biosize, 521 bp->b_dirtyend - bcount); 522 bp->b_dirtyend = bcount; 523 } 524 if (bp->b_dirtyoff >= bp->b_dirtyend) 525 bp->b_dirtyoff = bp->b_dirtyend = 0; 526 527 /* 528 * If the new write will leave a contiguous dirty 529 * area, just update the b_dirtyoff and b_dirtyend, 530 * otherwise force a write rpc of the old dirty area. 531 * 532 * While it is possible to merge discontiguous writes due to 533 * our having a B_CACHE buffer ( and thus valid read data 534 * for the hole), we don't because it could lead to 535 * significant cache coherency problems with multiple clients, 536 * especially if locking is implemented later on. 537 * 538 * as an optimization we could theoretically maintain 539 * a linked list of discontinuous areas, but we would still 540 * have to commit them separately so there isn't much 541 * advantage to it except perhaps a bit of asynchronization. 542 */ 543 544 if (bp->b_dirtyend > 0 && 545 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { 546 /* 547 * Yes, we mean it. Write out everything to "storage" 548 * immediatly, without hesitation. (Apart from other 549 * reasons: the only way to know if a write is valid 550 * if its actually written out.) 551 */ 552 bwrite(bp); 553 if (bp->b_error == EINTR) { 554 err = EINTR; 555 break; 556 } 557 goto again; 558 } 559 err = uiomove((char *)bp->b_data + on, n, uio); 560 561 /* 562 * Since this block is being modified, it must be written 563 * again and not just committed. Since write clustering does 564 * not work for the stage 1 data write, only the stage 2 565 * commit rpc, we have to clear B_CLUSTEROK as well. 566 */ 567 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 568 569 if (err) { 570 bp->b_ioflags |= BIO_ERROR; 571 bp->b_error = err; 572 brelse(bp); 573 break; 574 } 575 /* 576 * Only update dirtyoff/dirtyend if not a degenerate 577 * condition. 578 */ 579 if (n) { 580 if (bp->b_dirtyend > 0) { 581 bp->b_dirtyoff = MIN(on, bp->b_dirtyoff); 582 bp->b_dirtyend = MAX((on + n), bp->b_dirtyend); 583 } else { 584 bp->b_dirtyoff = on; 585 bp->b_dirtyend = on + n; 586 } 587 vfs_bio_set_valid(bp, on, n); 588 } 589 err = bwrite(bp); 590 if (err) 591 break; 592 } while (uio->uio_resid > 0 && n > 0); 593 594 if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0) 595 fuse_vnode_savesize(vp, cred); 596 597 return (err); 598 } 599 600 int 601 fuse_io_strategy(struct vnode *vp, struct buf *bp) 602 { 603 struct fuse_filehandle *fufh; 604 struct fuse_vnode_data *fvdat = VTOFUD(vp); 605 struct ucred *cred; 606 struct uio *uiop; 607 struct uio uio; 608 struct iovec io; 609 int error = 0; 610 611 const int biosize = fuse_iosize(vp); 612 613 MPASS(vp->v_type == VREG || vp->v_type == VDIR); 614 MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); 615 FS_DEBUG("inode=%ju offset=%jd resid=%ld\n", 616 (uintmax_t)VTOI(vp), (intmax_t)(((off_t)bp->b_blkno) * biosize), 617 bp->b_bcount); 618 619 error = fuse_filehandle_getrw(vp, 620 (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); 621 if (error) { 622 printf("FUSE: strategy: filehandles are closed\n"); 623 bp->b_ioflags |= BIO_ERROR; 624 bp->b_error = error; 625 return (error); 626 } 627 cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; 628 629 uiop = &uio; 630 uiop->uio_iov = &io; 631 uiop->uio_iovcnt = 1; 632 uiop->uio_segflg = UIO_SYSSPACE; 633 uiop->uio_td = curthread; 634 635 /* 636 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We 637 * do this here so we do not have to do it in all the code that 638 * calls us. 639 */ 640 bp->b_flags &= ~B_INVAL; 641 bp->b_ioflags &= ~BIO_ERROR; 642 643 KASSERT(!(bp->b_flags & B_DONE), 644 ("fuse_io_strategy: bp %p already marked done", bp)); 645 if (bp->b_iocmd == BIO_READ) { 646 io.iov_len = uiop->uio_resid = bp->b_bcount; 647 io.iov_base = bp->b_data; 648 uiop->uio_rw = UIO_READ; 649 650 uiop->uio_offset = ((off_t)bp->b_blkno) * biosize; 651 error = fuse_read_directbackend(vp, uiop, cred, fufh); 652 653 if ((!error && uiop->uio_resid) || 654 (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO && 655 uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 && 656 uiop->uio_offset >= fvdat->cached_attrs.va_size)) { 657 /* 658 * If we had a short read with no error, we must have 659 * hit a file hole. We should zero-fill the remainder. 660 * This can also occur if the server hits the file EOF. 661 * 662 * Holes used to be able to occur due to pending 663 * writes, but that is not possible any longer. 664 */ 665 int nread = bp->b_bcount - uiop->uio_resid; 666 int left = uiop->uio_resid; 667 668 if (error != 0) { 669 printf("FUSE: Fix broken io: offset %ju, " 670 " resid %zd, file size %ju/%ju\n", 671 (uintmax_t)uiop->uio_offset, 672 uiop->uio_resid, fvdat->filesize, 673 fvdat->cached_attrs.va_size); 674 error = 0; 675 } 676 if (left > 0) 677 bzero((char *)bp->b_data + nread, left); 678 uiop->uio_resid = 0; 679 } 680 if (error) { 681 bp->b_ioflags |= BIO_ERROR; 682 bp->b_error = error; 683 } 684 } else { 685 /* 686 * If we only need to commit, try to commit 687 */ 688 if (bp->b_flags & B_NEEDCOMMIT) { 689 FS_DEBUG("write: B_NEEDCOMMIT flags set\n"); 690 } 691 /* 692 * Setup for actual write 693 */ 694 if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > 695 fvdat->filesize) 696 bp->b_dirtyend = fvdat->filesize - 697 (off_t)bp->b_blkno * biosize; 698 699 if (bp->b_dirtyend > bp->b_dirtyoff) { 700 io.iov_len = uiop->uio_resid = bp->b_dirtyend 701 - bp->b_dirtyoff; 702 uiop->uio_offset = (off_t)bp->b_blkno * biosize 703 + bp->b_dirtyoff; 704 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 705 uiop->uio_rw = UIO_WRITE; 706 707 error = fuse_write_directbackend(vp, uiop, cred, fufh); 708 709 if (error == EINTR || error == ETIMEDOUT 710 || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 711 712 bp->b_flags &= ~(B_INVAL | B_NOCACHE); 713 if ((bp->b_flags & B_PAGING) == 0) { 714 bdirty(bp); 715 bp->b_flags &= ~B_DONE; 716 } 717 if ((error == EINTR || error == ETIMEDOUT) && 718 (bp->b_flags & B_ASYNC) == 0) 719 bp->b_flags |= B_EINTR; 720 } else { 721 if (error) { 722 bp->b_ioflags |= BIO_ERROR; 723 bp->b_flags |= B_INVAL; 724 bp->b_error = error; 725 } 726 bp->b_dirtyoff = bp->b_dirtyend = 0; 727 } 728 } else { 729 bp->b_resid = 0; 730 bufdone(bp); 731 return (0); 732 } 733 } 734 bp->b_resid = uiop->uio_resid; 735 bufdone(bp); 736 return (error); 737 } 738 739 int 740 fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td) 741 { 742 struct vop_fsync_args a = { 743 .a_vp = vp, 744 .a_waitfor = waitfor, 745 .a_td = td, 746 }; 747 748 return (vop_stdfsync(&a)); 749 } 750 751 /* 752 * Flush and invalidate all dirty buffers. If another process is already 753 * doing the flush, just wait for completion. 754 */ 755 int 756 fuse_io_invalbuf(struct vnode *vp, struct thread *td) 757 { 758 struct fuse_vnode_data *fvdat = VTOFUD(vp); 759 int error = 0; 760 761 if (vp->v_iflag & VI_DOOMED) 762 return 0; 763 764 ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf"); 765 766 while (fvdat->flag & FN_FLUSHINPROG) { 767 struct proc *p = td->td_proc; 768 769 if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) 770 return EIO; 771 fvdat->flag |= FN_FLUSHWANT; 772 tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz); 773 error = 0; 774 if (p != NULL) { 775 PROC_LOCK(p); 776 if (SIGNOTEMPTY(p->p_siglist) || 777 SIGNOTEMPTY(td->td_siglist)) 778 error = EINTR; 779 PROC_UNLOCK(p); 780 } 781 if (error == EINTR) 782 return EINTR; 783 } 784 fvdat->flag |= FN_FLUSHINPROG; 785 786 if (vp->v_bufobj.bo_object != NULL) { 787 VM_OBJECT_LOCK(vp->v_bufobj.bo_object); 788 vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); 789 VM_OBJECT_UNLOCK(vp->v_bufobj.bo_object); 790 } 791 error = vinvalbuf(vp, V_SAVE, PCATCH, 0); 792 while (error) { 793 if (error == ERESTART || error == EINTR) { 794 fvdat->flag &= ~FN_FLUSHINPROG; 795 if (fvdat->flag & FN_FLUSHWANT) { 796 fvdat->flag &= ~FN_FLUSHWANT; 797 wakeup(&fvdat->flag); 798 } 799 return EINTR; 800 } 801 error = vinvalbuf(vp, V_SAVE, PCATCH, 0); 802 } 803 fvdat->flag &= ~FN_FLUSHINPROG; 804 if (fvdat->flag & FN_FLUSHWANT) { 805 fvdat->flag &= ~FN_FLUSHWANT; 806 wakeup(&fvdat->flag); 807 } 808 return (error); 809 } 810