1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 2007-2009 Google Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are 9 * met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above 14 * copyright notice, this list of conditions and the following disclaimer 15 * in the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Google Inc. nor the names of its 18 * contributors may be used to endorse or promote products derived from 19 * this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Copyright (C) 2005 Csaba Henk. 34 * All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 45 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND 46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 48 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 55 * SUCH DAMAGE. 56 */ 57 58 #include <sys/cdefs.h> 59 __FBSDID("$FreeBSD$"); 60 61 #include <sys/types.h> 62 #include <sys/module.h> 63 #include <sys/systm.h> 64 #include <sys/errno.h> 65 #include <sys/param.h> 66 #include <sys/kernel.h> 67 #include <sys/conf.h> 68 #include <sys/uio.h> 69 #include <sys/malloc.h> 70 #include <sys/queue.h> 71 #include <sys/lock.h> 72 #include <sys/sx.h> 73 #include <sys/mutex.h> 74 #include <sys/rwlock.h> 75 #include <sys/proc.h> 76 #include <sys/mount.h> 77 #include <sys/vnode.h> 78 #include <sys/stat.h> 79 #include <sys/unistd.h> 80 #include <sys/filedesc.h> 81 #include <sys/file.h> 82 #include <sys/fcntl.h> 83 #include <sys/bio.h> 84 #include <sys/buf.h> 85 #include <sys/sysctl.h> 86 87 #include <vm/vm.h> 88 #include <vm/vm_extern.h> 89 #include <vm/pmap.h> 90 #include <vm/vm_map.h> 91 #include <vm/vm_page.h> 92 #include <vm/vm_object.h> 93 94 #include "fuse.h" 95 #include "fuse_file.h" 96 #include "fuse_node.h" 97 #include "fuse_internal.h" 98 #include "fuse_ipc.h" 99 #include "fuse_io.h" 100 101 #define FUSE_DEBUG_MODULE IO 102 #include "fuse_debug.h" 103 104 105 static int 106 fuse_read_directbackend(struct vnode *vp, struct uio *uio, 107 struct ucred *cred, struct fuse_filehandle *fufh); 108 static int 109 fuse_read_biobackend(struct vnode *vp, struct uio *uio, 110 struct ucred *cred, struct fuse_filehandle *fufh); 111 static int 112 fuse_write_directbackend(struct vnode *vp, struct uio *uio, 113 struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); 114 static int 115 fuse_write_biobackend(struct vnode *vp, struct uio *uio, 116 struct ucred *cred, struct fuse_filehandle *fufh, int ioflag); 117 118 int 119 fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, 120 struct ucred *cred) 121 { 122 struct fuse_filehandle *fufh; 123 int err, directio; 124 125 MPASS(vp->v_type == VREG || vp->v_type == VDIR); 126 127 err = fuse_filehandle_getrw(vp, 128 (uio->uio_rw == UIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); 129 if (err) { 130 printf("FUSE: io dispatch: filehandles are closed\n"); 131 return err; 132 } 133 /* 134 * Ideally, when the daemon asks for direct io at open time, the 135 * standard file flag should be set according to this, so that would 136 * just change the default mode, which later on could be changed via 137 * fcntl(2). 138 * But this doesn't work, the O_DIRECT flag gets cleared at some point 139 * (don't know where). So to make any use of the Fuse direct_io option, 140 * we hardwire it into the file's private data (similarly to Linux, 141 * btw.). 142 */ 143 directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); 144 145 switch (uio->uio_rw) { 146 case UIO_READ: 147 if (directio) { 148 FS_DEBUG("direct read of vnode %ju via file handle %ju\n", 149 (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id); 150 err = fuse_read_directbackend(vp, uio, cred, fufh); 151 } else { 152 FS_DEBUG("buffered read of vnode %ju\n", 153 (uintmax_t)VTOILLU(vp)); 154 err = fuse_read_biobackend(vp, uio, cred, fufh); 155 } 156 break; 157 case UIO_WRITE: 158 if (directio) { 159 FS_DEBUG("direct write of vnode %ju via file handle %ju\n", 160 (uintmax_t)VTOILLU(vp), (uintmax_t)fufh->fh_id); 161 err = fuse_write_directbackend(vp, uio, cred, fufh, ioflag); 162 } else { 163 FS_DEBUG("buffered write of vnode %ju\n", 164 (uintmax_t)VTOILLU(vp)); 165 err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag); 166 } 167 break; 168 default: 169 panic("uninterpreted mode passed to fuse_io_dispatch"); 170 } 171 172 return (err); 173 } 174 175 static int 176 fuse_read_biobackend(struct vnode *vp, struct uio *uio, 177 struct ucred *cred, struct fuse_filehandle *fufh) 178 { 179 struct buf *bp; 180 daddr_t lbn; 181 int bcount; 182 int err = 0, n = 0, on = 0; 183 off_t filesize; 184 185 const int biosize = fuse_iosize(vp); 186 187 FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n", 188 uio->uio_resid, uio->uio_offset, VTOFUD(vp)->filesize); 189 190 if (uio->uio_resid == 0) 191 return (0); 192 if (uio->uio_offset < 0) 193 return (EINVAL); 194 195 bcount = MIN(MAXBSIZE, biosize); 196 filesize = VTOFUD(vp)->filesize; 197 198 do { 199 if (fuse_isdeadfs(vp)) { 200 err = ENXIO; 201 break; 202 } 203 lbn = uio->uio_offset / biosize; 204 on = uio->uio_offset & (biosize - 1); 205 206 FS_DEBUG2G("biosize %d, lbn %d, on %d\n", biosize, (int)lbn, on); 207 208 /* 209 * Obtain the buffer cache block. Figure out the buffer size 210 * when we are at EOF. If we are modifying the size of the 211 * buffer based on an EOF condition we need to hold 212 * nfs_rslock() through obtaining the buffer to prevent 213 * a potential writer-appender from messing with n_size. 214 * Otherwise we may accidentally truncate the buffer and 215 * lose dirty data. 216 * 217 * Note that bcount is *not* DEV_BSIZE aligned. 218 */ 219 if ((off_t)lbn * biosize >= filesize) { 220 bcount = 0; 221 } else if ((off_t)(lbn + 1) * biosize > filesize) { 222 bcount = filesize - (off_t)lbn *biosize; 223 } 224 bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); 225 226 if (!bp) 227 return (EINTR); 228 229 /* 230 * If B_CACHE is not set, we must issue the read. If this 231 * fails, we return an error. 232 */ 233 234 if ((bp->b_flags & B_CACHE) == 0) { 235 bp->b_iocmd = BIO_READ; 236 vfs_busy_pages(bp, 0); 237 err = fuse_io_strategy(vp, bp); 238 if (err) { 239 brelse(bp); 240 return (err); 241 } 242 } 243 /* 244 * on is the offset into the current bp. Figure out how many 245 * bytes we can copy out of the bp. Note that bcount is 246 * NOT DEV_BSIZE aligned. 247 * 248 * Then figure out how many bytes we can copy into the uio. 249 */ 250 251 n = 0; 252 if (on < bcount) 253 n = MIN((unsigned)(bcount - on), uio->uio_resid); 254 if (n > 0) { 255 FS_DEBUG2G("feeding buffeater with %d bytes of buffer %p," 256 " saying %d was asked for\n", 257 n, bp->b_data + on, n + (int)bp->b_resid); 258 err = uiomove(bp->b_data + on, n, uio); 259 } 260 brelse(bp); 261 FS_DEBUG2G("end of turn, err %d, uio->uio_resid %zd, n %d\n", 262 err, uio->uio_resid, n); 263 } while (err == 0 && uio->uio_resid > 0 && n > 0); 264 265 return (err); 266 } 267 268 static int 269 fuse_read_directbackend(struct vnode *vp, struct uio *uio, 270 struct ucred *cred, struct fuse_filehandle *fufh) 271 { 272 struct fuse_dispatcher fdi; 273 struct fuse_read_in *fri; 274 int err = 0; 275 276 if (uio->uio_resid == 0) 277 return (0); 278 279 fdisp_init(&fdi, 0); 280 281 /* 282 * XXX In "normal" case we use an intermediate kernel buffer for 283 * transmitting data from daemon's context to ours. Eventually, we should 284 * get rid of this. Anyway, if the target uio lives in sysspace (we are 285 * called from pageops), and the input data doesn't need kernel-side 286 * processing (we are not called from readdir) we can already invoke 287 * an optimized, "peer-to-peer" I/O routine. 288 */ 289 while (uio->uio_resid > 0) { 290 fdi.iosize = sizeof(*fri); 291 fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred); 292 fri = fdi.indata; 293 fri->fh = fufh->fh_id; 294 fri->offset = uio->uio_offset; 295 fri->size = MIN(uio->uio_resid, 296 fuse_get_mpdata(vp->v_mount)->max_read); 297 298 FS_DEBUG2G("fri->fh %ju, fri->offset %ju, fri->size %ju\n", 299 (uintmax_t)fri->fh, (uintmax_t)fri->offset, 300 (uintmax_t)fri->size); 301 302 if ((err = fdisp_wait_answ(&fdi))) 303 goto out; 304 305 FS_DEBUG2G("complete: got iosize=%d, requested fri.size=%zd; " 306 "resid=%zd offset=%ju\n", 307 fri->size, fdi.iosize, uio->uio_resid, 308 (uintmax_t)uio->uio_offset); 309 310 if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) 311 break; 312 if (fdi.iosize < fri->size) 313 break; 314 } 315 316 out: 317 fdisp_destroy(&fdi); 318 return (err); 319 } 320 321 static int 322 fuse_write_directbackend(struct vnode *vp, struct uio *uio, 323 struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) 324 { 325 struct fuse_vnode_data *fvdat = VTOFUD(vp); 326 struct fuse_write_in *fwi; 327 struct fuse_dispatcher fdi; 328 size_t chunksize; 329 int diff; 330 int err = 0; 331 332 if (uio->uio_resid == 0) 333 return (0); 334 if (ioflag & IO_APPEND) 335 uio_setoffset(uio, fvdat->filesize); 336 337 fdisp_init(&fdi, 0); 338 339 while (uio->uio_resid > 0) { 340 chunksize = MIN(uio->uio_resid, 341 fuse_get_mpdata(vp->v_mount)->max_write); 342 343 fdi.iosize = sizeof(*fwi) + chunksize; 344 fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); 345 346 fwi = fdi.indata; 347 fwi->fh = fufh->fh_id; 348 fwi->offset = uio->uio_offset; 349 fwi->size = chunksize; 350 351 if ((err = uiomove((char *)fdi.indata + sizeof(*fwi), 352 chunksize, uio))) 353 break; 354 355 if ((err = fdisp_wait_answ(&fdi))) 356 break; 357 358 diff = chunksize - ((struct fuse_write_out *)fdi.answ)->size; 359 if (diff < 0) { 360 err = EINVAL; 361 break; 362 } 363 uio->uio_resid += diff; 364 uio->uio_offset -= diff; 365 if (uio->uio_offset > fvdat->filesize) 366 fuse_vnode_setsize(vp, cred, uio->uio_offset); 367 } 368 369 fdisp_destroy(&fdi); 370 371 return (err); 372 } 373 374 static int 375 fuse_write_biobackend(struct vnode *vp, struct uio *uio, 376 struct ucred *cred, struct fuse_filehandle *fufh, int ioflag) 377 { 378 struct fuse_vnode_data *fvdat = VTOFUD(vp); 379 struct buf *bp; 380 daddr_t lbn; 381 int bcount; 382 int n, on, err = 0; 383 384 const int biosize = fuse_iosize(vp); 385 386 KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode")); 387 FS_DEBUG("resid=%zx offset=%jx fsize=%jx\n", 388 uio->uio_resid, uio->uio_offset, fvdat->filesize); 389 if (vp->v_type != VREG) 390 return (EIO); 391 if (uio->uio_offset < 0) 392 return (EINVAL); 393 if (uio->uio_resid == 0) 394 return (0); 395 if (ioflag & IO_APPEND) 396 uio_setoffset(uio, fvdat->filesize); 397 398 /* 399 * Find all of this file's B_NEEDCOMMIT buffers. If our writes 400 * would exceed the local maximum per-file write commit size when 401 * combined with those, we must decide whether to flush, 402 * go synchronous, or return err. We don't bother checking 403 * IO_UNIT -- we just make all writes atomic anyway, as there's 404 * no point optimizing for something that really won't ever happen. 405 */ 406 do { 407 if (fuse_isdeadfs(vp)) { 408 err = ENXIO; 409 break; 410 } 411 lbn = uio->uio_offset / biosize; 412 on = uio->uio_offset & (biosize - 1); 413 n = MIN((unsigned)(biosize - on), uio->uio_resid); 414 415 FS_DEBUG2G("lbn %ju, on %d, n %d, uio offset %ju, uio resid %zd\n", 416 (uintmax_t)lbn, on, n, 417 (uintmax_t)uio->uio_offset, uio->uio_resid); 418 419 again: 420 /* 421 * Handle direct append and file extension cases, calculate 422 * unaligned buffer size. 423 */ 424 if (uio->uio_offset == fvdat->filesize && n) { 425 /* 426 * Get the buffer (in its pre-append state to maintain 427 * B_CACHE if it was previously set). Resize the 428 * nfsnode after we have locked the buffer to prevent 429 * readers from reading garbage. 430 */ 431 bcount = on; 432 FS_DEBUG("getting block from OS, bcount %d\n", bcount); 433 bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); 434 435 if (bp != NULL) { 436 long save; 437 438 err = fuse_vnode_setsize(vp, cred, 439 uio->uio_offset + n); 440 if (err) { 441 brelse(bp); 442 break; 443 } 444 save = bp->b_flags & B_CACHE; 445 bcount += n; 446 allocbuf(bp, bcount); 447 bp->b_flags |= save; 448 } 449 } else { 450 /* 451 * Obtain the locked cache block first, and then 452 * adjust the file's size as appropriate. 453 */ 454 bcount = on + n; 455 if ((off_t)lbn * biosize + bcount < fvdat->filesize) { 456 if ((off_t)(lbn + 1) * biosize < fvdat->filesize) 457 bcount = biosize; 458 else 459 bcount = fvdat->filesize - 460 (off_t)lbn *biosize; 461 } 462 FS_DEBUG("getting block from OS, bcount %d\n", bcount); 463 bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); 464 if (bp && uio->uio_offset + n > fvdat->filesize) { 465 err = fuse_vnode_setsize(vp, cred, 466 uio->uio_offset + n); 467 if (err) { 468 brelse(bp); 469 break; 470 } 471 } 472 } 473 474 if (!bp) { 475 err = EINTR; 476 break; 477 } 478 /* 479 * Issue a READ if B_CACHE is not set. In special-append 480 * mode, B_CACHE is based on the buffer prior to the write 481 * op and is typically set, avoiding the read. If a read 482 * is required in special append mode, the server will 483 * probably send us a short-read since we extended the file 484 * on our end, resulting in b_resid == 0 and, thusly, 485 * B_CACHE getting set. 486 * 487 * We can also avoid issuing the read if the write covers 488 * the entire buffer. We have to make sure the buffer state 489 * is reasonable in this case since we will not be initiating 490 * I/O. See the comments in kern/vfs_bio.c's getblk() for 491 * more information. 492 * 493 * B_CACHE may also be set due to the buffer being cached 494 * normally. 495 */ 496 497 if (on == 0 && n == bcount) { 498 bp->b_flags |= B_CACHE; 499 bp->b_flags &= ~B_INVAL; 500 bp->b_ioflags &= ~BIO_ERROR; 501 } 502 if ((bp->b_flags & B_CACHE) == 0) { 503 bp->b_iocmd = BIO_READ; 504 vfs_busy_pages(bp, 0); 505 fuse_io_strategy(vp, bp); 506 if ((err = bp->b_error)) { 507 brelse(bp); 508 break; 509 } 510 } 511 if (bp->b_wcred == NOCRED) 512 bp->b_wcred = crhold(cred); 513 514 /* 515 * If dirtyend exceeds file size, chop it down. This should 516 * not normally occur but there is an append race where it 517 * might occur XXX, so we log it. 518 * 519 * If the chopping creates a reverse-indexed or degenerate 520 * situation with dirtyoff/end, we 0 both of them. 521 */ 522 523 if (bp->b_dirtyend > bcount) { 524 FS_DEBUG("FUSE append race @%lx:%d\n", 525 (long)bp->b_blkno * biosize, 526 bp->b_dirtyend - bcount); 527 bp->b_dirtyend = bcount; 528 } 529 if (bp->b_dirtyoff >= bp->b_dirtyend) 530 bp->b_dirtyoff = bp->b_dirtyend = 0; 531 532 /* 533 * If the new write will leave a contiguous dirty 534 * area, just update the b_dirtyoff and b_dirtyend, 535 * otherwise force a write rpc of the old dirty area. 536 * 537 * While it is possible to merge discontiguous writes due to 538 * our having a B_CACHE buffer ( and thus valid read data 539 * for the hole), we don't because it could lead to 540 * significant cache coherency problems with multiple clients, 541 * especially if locking is implemented later on. 542 * 543 * as an optimization we could theoretically maintain 544 * a linked list of discontinuous areas, but we would still 545 * have to commit them separately so there isn't much 546 * advantage to it except perhaps a bit of asynchronization. 547 */ 548 549 if (bp->b_dirtyend > 0 && 550 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { 551 /* 552 * Yes, we mean it. Write out everything to "storage" 553 * immediately, without hesitation. (Apart from other 554 * reasons: the only way to know if a write is valid 555 * if its actually written out.) 556 */ 557 bwrite(bp); 558 if (bp->b_error == EINTR) { 559 err = EINTR; 560 break; 561 } 562 goto again; 563 } 564 err = uiomove((char *)bp->b_data + on, n, uio); 565 566 /* 567 * Since this block is being modified, it must be written 568 * again and not just committed. Since write clustering does 569 * not work for the stage 1 data write, only the stage 2 570 * commit rpc, we have to clear B_CLUSTEROK as well. 571 */ 572 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 573 574 if (err) { 575 bp->b_ioflags |= BIO_ERROR; 576 bp->b_error = err; 577 brelse(bp); 578 break; 579 } 580 /* 581 * Only update dirtyoff/dirtyend if not a degenerate 582 * condition. 583 */ 584 if (n) { 585 if (bp->b_dirtyend > 0) { 586 bp->b_dirtyoff = MIN(on, bp->b_dirtyoff); 587 bp->b_dirtyend = MAX((on + n), bp->b_dirtyend); 588 } else { 589 bp->b_dirtyoff = on; 590 bp->b_dirtyend = on + n; 591 } 592 vfs_bio_set_valid(bp, on, n); 593 } 594 err = bwrite(bp); 595 if (err) 596 break; 597 } while (uio->uio_resid > 0 && n > 0); 598 599 if (fuse_sync_resize && (fvdat->flag & FN_SIZECHANGE) != 0) 600 fuse_vnode_savesize(vp, cred); 601 602 return (err); 603 } 604 605 int 606 fuse_io_strategy(struct vnode *vp, struct buf *bp) 607 { 608 struct fuse_filehandle *fufh; 609 struct fuse_vnode_data *fvdat = VTOFUD(vp); 610 struct ucred *cred; 611 struct uio *uiop; 612 struct uio uio; 613 struct iovec io; 614 int error = 0; 615 616 const int biosize = fuse_iosize(vp); 617 618 MPASS(vp->v_type == VREG || vp->v_type == VDIR); 619 MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); 620 FS_DEBUG("inode=%ju offset=%jd resid=%ld\n", 621 (uintmax_t)VTOI(vp), (intmax_t)(((off_t)bp->b_blkno) * biosize), 622 bp->b_bcount); 623 624 error = fuse_filehandle_getrw(vp, 625 (bp->b_iocmd == BIO_READ) ? FUFH_RDONLY : FUFH_WRONLY, &fufh); 626 if (error) { 627 printf("FUSE: strategy: filehandles are closed\n"); 628 bp->b_ioflags |= BIO_ERROR; 629 bp->b_error = error; 630 return (error); 631 } 632 cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; 633 634 uiop = &uio; 635 uiop->uio_iov = &io; 636 uiop->uio_iovcnt = 1; 637 uiop->uio_segflg = UIO_SYSSPACE; 638 uiop->uio_td = curthread; 639 640 /* 641 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We 642 * do this here so we do not have to do it in all the code that 643 * calls us. 644 */ 645 bp->b_flags &= ~B_INVAL; 646 bp->b_ioflags &= ~BIO_ERROR; 647 648 KASSERT(!(bp->b_flags & B_DONE), 649 ("fuse_io_strategy: bp %p already marked done", bp)); 650 if (bp->b_iocmd == BIO_READ) { 651 io.iov_len = uiop->uio_resid = bp->b_bcount; 652 io.iov_base = bp->b_data; 653 uiop->uio_rw = UIO_READ; 654 655 uiop->uio_offset = ((off_t)bp->b_blkno) * biosize; 656 error = fuse_read_directbackend(vp, uiop, cred, fufh); 657 658 if ((!error && uiop->uio_resid) || 659 (fsess_opt_brokenio(vnode_mount(vp)) && error == EIO && 660 uiop->uio_offset < fvdat->filesize && fvdat->filesize > 0 && 661 uiop->uio_offset >= fvdat->cached_attrs.va_size)) { 662 /* 663 * If we had a short read with no error, we must have 664 * hit a file hole. We should zero-fill the remainder. 665 * This can also occur if the server hits the file EOF. 666 * 667 * Holes used to be able to occur due to pending 668 * writes, but that is not possible any longer. 669 */ 670 int nread = bp->b_bcount - uiop->uio_resid; 671 int left = uiop->uio_resid; 672 673 if (error != 0) { 674 printf("FUSE: Fix broken io: offset %ju, " 675 " resid %zd, file size %ju/%ju\n", 676 (uintmax_t)uiop->uio_offset, 677 uiop->uio_resid, fvdat->filesize, 678 fvdat->cached_attrs.va_size); 679 error = 0; 680 } 681 if (left > 0) 682 bzero((char *)bp->b_data + nread, left); 683 uiop->uio_resid = 0; 684 } 685 if (error) { 686 bp->b_ioflags |= BIO_ERROR; 687 bp->b_error = error; 688 } 689 } else { 690 /* 691 * If we only need to commit, try to commit 692 */ 693 if (bp->b_flags & B_NEEDCOMMIT) { 694 FS_DEBUG("write: B_NEEDCOMMIT flags set\n"); 695 } 696 /* 697 * Setup for actual write 698 */ 699 if ((off_t)bp->b_blkno * biosize + bp->b_dirtyend > 700 fvdat->filesize) 701 bp->b_dirtyend = fvdat->filesize - 702 (off_t)bp->b_blkno * biosize; 703 704 if (bp->b_dirtyend > bp->b_dirtyoff) { 705 io.iov_len = uiop->uio_resid = bp->b_dirtyend 706 - bp->b_dirtyoff; 707 uiop->uio_offset = (off_t)bp->b_blkno * biosize 708 + bp->b_dirtyoff; 709 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 710 uiop->uio_rw = UIO_WRITE; 711 712 error = fuse_write_directbackend(vp, uiop, cred, fufh, 0); 713 714 if (error == EINTR || error == ETIMEDOUT 715 || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 716 717 bp->b_flags &= ~(B_INVAL | B_NOCACHE); 718 if ((bp->b_flags & B_PAGING) == 0) { 719 bdirty(bp); 720 bp->b_flags &= ~B_DONE; 721 } 722 if ((error == EINTR || error == ETIMEDOUT) && 723 (bp->b_flags & B_ASYNC) == 0) 724 bp->b_flags |= B_EINTR; 725 } else { 726 if (error) { 727 bp->b_ioflags |= BIO_ERROR; 728 bp->b_flags |= B_INVAL; 729 bp->b_error = error; 730 } 731 bp->b_dirtyoff = bp->b_dirtyend = 0; 732 } 733 } else { 734 bp->b_resid = 0; 735 bufdone(bp); 736 return (0); 737 } 738 } 739 bp->b_resid = uiop->uio_resid; 740 bufdone(bp); 741 return (error); 742 } 743 744 int 745 fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td) 746 { 747 struct vop_fsync_args a = { 748 .a_vp = vp, 749 .a_waitfor = waitfor, 750 .a_td = td, 751 }; 752 753 return (vop_stdfsync(&a)); 754 } 755 756 /* 757 * Flush and invalidate all dirty buffers. If another process is already 758 * doing the flush, just wait for completion. 759 */ 760 int 761 fuse_io_invalbuf(struct vnode *vp, struct thread *td) 762 { 763 struct fuse_vnode_data *fvdat = VTOFUD(vp); 764 int error = 0; 765 766 if (vp->v_iflag & VI_DOOMED) 767 return 0; 768 769 ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf"); 770 771 while (fvdat->flag & FN_FLUSHINPROG) { 772 struct proc *p = td->td_proc; 773 774 if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) 775 return EIO; 776 fvdat->flag |= FN_FLUSHWANT; 777 tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz); 778 error = 0; 779 if (p != NULL) { 780 PROC_LOCK(p); 781 if (SIGNOTEMPTY(p->p_siglist) || 782 SIGNOTEMPTY(td->td_siglist)) 783 error = EINTR; 784 PROC_UNLOCK(p); 785 } 786 if (error == EINTR) 787 return EINTR; 788 } 789 fvdat->flag |= FN_FLUSHINPROG; 790 791 if (vp->v_bufobj.bo_object != NULL) { 792 VM_OBJECT_WLOCK(vp->v_bufobj.bo_object); 793 vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); 794 VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object); 795 } 796 error = vinvalbuf(vp, V_SAVE, PCATCH, 0); 797 while (error) { 798 if (error == ERESTART || error == EINTR) { 799 fvdat->flag &= ~FN_FLUSHINPROG; 800 if (fvdat->flag & FN_FLUSHWANT) { 801 fvdat->flag &= ~FN_FLUSHWANT; 802 wakeup(&fvdat->flag); 803 } 804 return EINTR; 805 } 806 error = vinvalbuf(vp, V_SAVE, PCATCH, 0); 807 } 808 fvdat->flag &= ~FN_FLUSHINPROG; 809 if (fvdat->flag & FN_FLUSHWANT) { 810 fvdat->flag &= ~FN_FLUSHWANT; 811 wakeup(&fvdat->flag); 812 } 813 return (error); 814 } 815