1 /*- 2 * Copyright (c) 2000-2003 Tor Egge 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/fcntl.h> 33 #include <sys/file.h> 34 #include <sys/stat.h> 35 #include <sys/proc.h> 36 #include <sys/limits.h> 37 #include <sys/mount.h> 38 #include <sys/namei.h> 39 #include <sys/vnode.h> 40 #include <sys/conf.h> 41 #include <sys/filio.h> 42 #include <sys/ttycom.h> 43 #include <sys/bio.h> 44 #include <sys/buf.h> 45 #include <ufs/ufs/extattr.h> 46 #include <ufs/ufs/quota.h> 47 #include <ufs/ufs/inode.h> 48 #include <ufs/ufs/ufsmount.h> 49 #include <ufs/ufs/ufs_extern.h> 50 #include <ufs/ffs/fs.h> 51 52 #include <vm/vm.h> 53 #include <vm/vm_extern.h> 54 #include <vm/vm_object.h> 55 #include <sys/kernel.h> 56 #include <sys/sysctl.h> 57 58 static int ffs_rawread_readahead(struct vnode *vp, 59 caddr_t udata, 60 off_t offset, 61 size_t len, 62 struct thread *td, 63 struct buf *bp, 64 caddr_t sa); 65 static int ffs_rawread_main(struct vnode *vp, 66 struct uio *uio); 67 68 static int ffs_rawread_sync(struct vnode *vp, struct thread *td); 69 70 int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 71 72 void ffs_rawread_setup(void); 73 74 static void ffs_rawreadwakeup(struct buf *bp); 75 76 77 SYSCTL_DECL(_vfs_ffs); 78 79 static int ffsrawbufcnt = 4; 80 SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0, 81 "Buffers available for raw reads"); 82 83 static int allowrawread = 1; 84 SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, 85 "Flag to enable raw reads"); 86 87 static int rawreadahead = 1; 88 SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, 89 "Flag to enable readahead for long raw reads"); 90 91 92 void 93 ffs_rawread_setup(void) 94 { 95 ffsrawbufcnt = (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8; 96 } 97 98 99 static int 100 ffs_rawread_sync(struct vnode *vp, struct thread *td) 101 { 102 int spl; 103 int error; 104 int upgraded; 105 106 GIANT_REQUIRED; 107 /* Check for dirty mmap, pending writes and dirty buffers */ 108 spl = splbio(); 109 VI_LOCK(vp); 110 if (vp->v_numoutput > 0 || 111 !TAILQ_EMPTY(&vp->v_dirtyblkhd) || 112 (vp->v_iflag & VI_OBJDIRTY) != 0) { 113 splx(spl); 114 VI_UNLOCK(vp); 115 116 if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE) { 117 upgraded = 1; 118 /* Upgrade to exclusive lock, this might block */ 119 VOP_LOCK(vp, LK_UPGRADE | LK_NOPAUSE, td); 120 } else 121 upgraded = 0; 122 123 124 /* Attempt to msync mmap() regions to clean dirty mmap */ 125 VI_LOCK(vp); 126 if ((vp->v_iflag & VI_OBJDIRTY) != 0) { 127 struct vm_object *obj; 128 VI_UNLOCK(vp); 129 if (VOP_GETVOBJECT(vp, &obj) == 0) { 130 VM_OBJECT_LOCK(obj); 131 vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); 132 VM_OBJECT_UNLOCK(obj); 133 } 134 VI_LOCK(vp); 135 } 136 137 /* Wait for pending writes to complete */ 138 spl = splbio(); 139 while (vp->v_numoutput) { 140 vp->v_iflag |= VI_BWAIT; 141 error = msleep((caddr_t)&vp->v_numoutput, 142 VI_MTX(vp), 143 PRIBIO + 1, 144 "rawrdfls", 0); 145 if (error != 0) { 146 splx(spl); 147 VI_UNLOCK(vp); 148 if (upgraded != 0) 149 VOP_LOCK(vp, LK_DOWNGRADE, td); 150 return (error); 151 } 152 } 153 /* Flush dirty buffers */ 154 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 155 splx(spl); 156 VI_UNLOCK(vp); 157 if ((error = VOP_FSYNC(vp, NOCRED, MNT_WAIT, td)) != 0) { 158 if (upgraded != 0) 159 VOP_LOCK(vp, LK_DOWNGRADE, td); 160 return (error); 161 } 162 VI_LOCK(vp); 163 spl = splbio(); 164 if (vp->v_numoutput > 0 || 165 !TAILQ_EMPTY(&vp->v_dirtyblkhd)) 166 panic("ffs_rawread_sync: dirty bufs"); 167 } 168 splx(spl); 169 VI_UNLOCK(vp); 170 if (upgraded != 0) 171 VOP_LOCK(vp, LK_DOWNGRADE, td); 172 } else { 173 splx(spl); 174 VI_UNLOCK(vp); 175 } 176 return 0; 177 } 178 179 180 static int 181 ffs_rawread_readahead(struct vnode *vp, 182 caddr_t udata, 183 off_t offset, 184 size_t len, 185 struct thread *td, 186 struct buf *bp, 187 caddr_t sa) 188 { 189 int error; 190 u_int iolen; 191 off_t blockno; 192 int blockoff; 193 int bsize; 194 struct vnode *dp; 195 int bforwards; 196 struct inode *ip; 197 ufs2_daddr_t blkno; 198 199 GIANT_REQUIRED; 200 bsize = vp->v_mount->mnt_stat.f_iosize; 201 202 ip = VTOI(vp); 203 dp = ip->i_devvp; 204 205 iolen = ((vm_offset_t) udata) & PAGE_MASK; 206 bp->b_bcount = len; 207 if (bp->b_bcount + iolen > bp->b_kvasize) { 208 bp->b_bcount = bp->b_kvasize; 209 if (iolen != 0) 210 bp->b_bcount -= PAGE_SIZE; 211 } 212 bp->b_flags = 0; /* XXX necessary ? */ 213 bp->b_iocmd = BIO_READ; 214 bp->b_iodone = ffs_rawreadwakeup; 215 bp->b_data = udata; 216 bp->b_saveaddr = sa; 217 blockno = offset / bsize; 218 blockoff = (offset % bsize) / DEV_BSIZE; 219 if ((daddr_t) blockno != blockno) { 220 return EINVAL; /* blockno overflow */ 221 } 222 223 bp->b_lblkno = bp->b_blkno = blockno; 224 225 error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL); 226 if (error != 0) 227 return error; 228 if (blkno == -1) { 229 230 /* Fill holes with NULs to preserve semantics */ 231 232 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) 233 bp->b_bcount = bsize - blockoff * DEV_BSIZE; 234 bp->b_bufsize = bp->b_bcount; 235 236 if (vmapbuf(bp) < 0) 237 return EFAULT; 238 239 if (ticks - PCPU_GET(switchticks) >= hogticks) 240 uio_yield(); 241 bzero(bp->b_data, bp->b_bufsize); 242 243 /* Mark operation completed (similar to bufdone()) */ 244 245 bp->b_resid = 0; 246 bp->b_flags |= B_DONE; 247 return 0; 248 } 249 bp->b_blkno = blkno + blockoff; 250 bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE; 251 252 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) 253 bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; 254 bp->b_bufsize = bp->b_bcount; 255 bp->b_dev = dp->v_rdev; 256 257 if (vmapbuf(bp) < 0) 258 return EFAULT; 259 260 if (dp->v_type == VCHR) 261 (void) VOP_SPECSTRATEGY(dp, bp); 262 else 263 (void) VOP_STRATEGY(dp, bp); 264 return 0; 265 } 266 267 268 static int 269 ffs_rawread_main(struct vnode *vp, 270 struct uio *uio) 271 { 272 int error, nerror; 273 struct buf *bp, *nbp, *tbp; 274 caddr_t sa, nsa, tsa; 275 u_int iolen; 276 int spl; 277 caddr_t udata; 278 long resid; 279 off_t offset; 280 struct thread *td; 281 282 GIANT_REQUIRED; 283 td = uio->uio_td ? uio->uio_td : curthread; 284 udata = uio->uio_iov->iov_base; 285 resid = uio->uio_resid; 286 offset = uio->uio_offset; 287 288 /* 289 * keep the process from being swapped 290 */ 291 PHOLD(td->td_proc); 292 293 error = 0; 294 nerror = 0; 295 296 bp = NULL; 297 nbp = NULL; 298 sa = NULL; 299 nsa = NULL; 300 301 while (resid > 0) { 302 303 if (bp == NULL) { /* Setup first read */ 304 /* XXX: Leave some bufs for swap */ 305 bp = getpbuf(&ffsrawbufcnt); 306 sa = bp->b_data; 307 bp->b_vp = vp; 308 error = ffs_rawread_readahead(vp, udata, offset, 309 resid, td, bp, sa); 310 if (error != 0) 311 break; 312 313 if (resid > bp->b_bufsize) { /* Setup fist readahead */ 314 /* XXX: Leave bufs for swap */ 315 if (rawreadahead != 0) 316 nbp = trypbuf(&ffsrawbufcnt); 317 else 318 nbp = NULL; 319 if (nbp != NULL) { 320 nsa = nbp->b_data; 321 nbp->b_vp = vp; 322 323 nerror = ffs_rawread_readahead(vp, 324 udata + 325 bp->b_bufsize, 326 offset + 327 bp->b_bufsize, 328 resid - 329 bp->b_bufsize, 330 td, 331 nbp, 332 nsa); 333 if (nerror) { 334 relpbuf(nbp, &ffsrawbufcnt); 335 nbp = NULL; 336 } 337 } 338 } 339 } 340 341 spl = splbio(); 342 bwait(bp, PRIBIO, "rawrd"); 343 splx(spl); 344 345 vunmapbuf(bp); 346 347 iolen = bp->b_bcount - bp->b_resid; 348 if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { 349 nerror = 0; /* Ignore possible beyond EOF error */ 350 break; /* EOF */ 351 } 352 353 if ((bp->b_ioflags & BIO_ERROR) != 0) { 354 error = bp->b_error; 355 break; 356 } 357 resid -= iolen; 358 udata += iolen; 359 offset += iolen; 360 if (iolen < bp->b_bufsize) { 361 /* Incomplete read. Try to read remaining part */ 362 error = ffs_rawread_readahead(vp, 363 udata, 364 offset, 365 bp->b_bufsize - iolen, 366 td, 367 bp, 368 sa); 369 if (error != 0) 370 break; 371 } else if (nbp != NULL) { /* Complete read with readahead */ 372 373 tbp = bp; 374 bp = nbp; 375 nbp = tbp; 376 377 tsa = sa; 378 sa = nsa; 379 nsa = tsa; 380 381 if (resid <= bp->b_bufsize) { /* No more readaheads */ 382 relpbuf(nbp, &ffsrawbufcnt); 383 nbp = NULL; 384 } else { /* Setup next readahead */ 385 nerror = ffs_rawread_readahead(vp, 386 udata + 387 bp->b_bufsize, 388 offset + 389 bp->b_bufsize, 390 resid - 391 bp->b_bufsize, 392 td, 393 nbp, 394 nsa); 395 if (nerror != 0) { 396 relpbuf(nbp, &ffsrawbufcnt); 397 nbp = NULL; 398 } 399 } 400 } else if (nerror != 0) {/* Deferred Readahead error */ 401 break; 402 } else if (resid > 0) { /* More to read, no readahead */ 403 error = ffs_rawread_readahead(vp, udata, offset, 404 resid, td, bp, sa); 405 if (error != 0) 406 break; 407 } 408 } 409 410 if (bp != NULL) 411 relpbuf(bp, &ffsrawbufcnt); 412 if (nbp != NULL) { /* Run down readahead buffer */ 413 spl = splbio(); 414 bwait(nbp, PRIBIO, "rawrd"); 415 splx(spl); 416 vunmapbuf(nbp); 417 relpbuf(nbp, &ffsrawbufcnt); 418 } 419 420 if (error == 0) 421 error = nerror; 422 PRELE(td->td_proc); 423 uio->uio_iov->iov_base = udata; 424 uio->uio_resid = resid; 425 uio->uio_offset = offset; 426 return error; 427 } 428 429 430 int 431 ffs_rawread(struct vnode *vp, 432 struct uio *uio, 433 int *workdone) 434 { 435 if (allowrawread != 0 && 436 uio->uio_iovcnt == 1 && 437 uio->uio_segflg == UIO_USERSPACE && 438 uio->uio_resid == uio->uio_iov->iov_len && 439 (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_flags & 440 TDF_DEADLKTREAT) == 0) { 441 int secsize; /* Media sector size */ 442 off_t filebytes; /* Bytes left of file */ 443 int blockbytes; /* Bytes left of file in full blocks */ 444 int partialbytes; /* Bytes in last partial block */ 445 int skipbytes; /* Bytes not to read in ffs_rawread */ 446 struct inode *ip; 447 int error; 448 449 450 /* Only handle sector aligned reads */ 451 ip = VTOI(vp); 452 secsize = ip->i_devvp->v_rdev->si_bsize_phys; 453 if ((uio->uio_offset & (secsize - 1)) == 0 && 454 (uio->uio_resid & (secsize - 1)) == 0) { 455 456 /* Sync dirty pages and buffers if needed */ 457 error = ffs_rawread_sync(vp, 458 (uio->uio_td != NULL) ? 459 uio->uio_td : curthread); 460 if (error != 0) 461 return error; 462 463 /* Check for end of file */ 464 if (ip->i_size > uio->uio_offset) { 465 filebytes = ip->i_size - uio->uio_offset; 466 467 /* No special eof handling needed ? */ 468 if (uio->uio_resid <= filebytes) { 469 *workdone = 1; 470 return ffs_rawread_main(vp, uio); 471 } 472 473 partialbytes = ((unsigned int) ip->i_size) % 474 ip->i_fs->fs_bsize; 475 blockbytes = (int) filebytes - partialbytes; 476 if (blockbytes > 0) { 477 skipbytes = uio->uio_resid - 478 blockbytes; 479 uio->uio_resid = blockbytes; 480 error = ffs_rawread_main(vp, uio); 481 uio->uio_resid += skipbytes; 482 if (error != 0) 483 return error; 484 /* Read remaining part using buffer */ 485 } 486 } 487 } 488 } 489 *workdone = 0; 490 return 0; 491 } 492 493 494 static void 495 ffs_rawreadwakeup(struct buf *bp) 496 { 497 bdone(bp); 498 } 499