1 /*- 2 * Copyright (c) 2000-2003 Tor Egge 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/fcntl.h> 33 #include <sys/file.h> 34 #include <sys/stat.h> 35 #include <sys/proc.h> 36 #include <sys/limits.h> 37 #include <sys/mount.h> 38 #include <sys/namei.h> 39 #include <sys/vnode.h> 40 #include <sys/conf.h> 41 #include <sys/filio.h> 42 #include <sys/ttycom.h> 43 #include <sys/bio.h> 44 #include <sys/buf.h> 45 #include <ufs/ufs/extattr.h> 46 #include <ufs/ufs/quota.h> 47 #include <ufs/ufs/inode.h> 48 #include <ufs/ufs/ufsmount.h> 49 #include <ufs/ufs/ufs_extern.h> 50 #include <ufs/ffs/fs.h> 51 #include <ufs/ffs/ffs_extern.h> 52 53 #include <vm/vm.h> 54 #include <vm/vm_extern.h> 55 #include <vm/vm_object.h> 56 #include <sys/kernel.h> 57 #include <sys/sysctl.h> 58 59 static int ffs_rawread_readahead(struct vnode *vp, 60 caddr_t udata, 61 off_t offset, 62 size_t len, 63 struct thread *td, 64 struct buf *bp, 65 caddr_t sa); 66 static int ffs_rawread_main(struct vnode *vp, 67 struct uio *uio); 68 69 static int ffs_rawread_sync(struct vnode *vp, struct thread *td); 70 71 int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 72 73 void ffs_rawread_setup(void); 74 75 SYSCTL_DECL(_vfs_ffs); 76 77 static int ffsrawbufcnt = 4; 78 SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0, 79 "Buffers available for raw reads"); 80 81 static int allowrawread = 1; 82 SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, 83 "Flag to enable raw reads"); 84 85 static int rawreadahead = 1; 86 SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, 87 "Flag to enable readahead for long raw reads"); 88 89 90 void 91 ffs_rawread_setup(void) 92 { 93 ffsrawbufcnt = (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8; 94 } 95 96 97 static int 98 ffs_rawread_sync(struct vnode *vp, struct thread *td) 99 { 100 int spl; 101 int error; 102 int upgraded; 103 struct bufobj *bo; 104 struct mount *mp; 105 106 /* Check for dirty mmap, pending writes and dirty buffers */ 107 spl = splbio(); 108 VI_LOCK(vp); 109 bo = &vp->v_bufobj; 110 if (bo->bo_numoutput > 0 || 111 bo->bo_dirty.bv_cnt > 0 || 112 (vp->v_iflag & VI_OBJDIRTY) != 0) { 113 splx(spl); 114 VI_UNLOCK(vp); 115 116 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 117 if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE) 118 upgraded = 1; 119 else 120 upgraded = 0; 121 VOP_UNLOCK(vp, 0, td); 122 (void) vn_start_write(vp, &mp, V_WAIT); 123 VOP_LOCK(vp, LK_EXCLUSIVE, td); 124 } else if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE) { 125 upgraded = 1; 126 /* Upgrade to exclusive lock, this might block */ 127 VOP_LOCK(vp, LK_UPGRADE, td); 128 } else 129 upgraded = 0; 130 131 132 VI_LOCK(vp); 133 /* Check if vnode was reclaimed while unlocked. */ 134 if ((vp->v_iflag & VI_DOOMED) != 0) { 135 VI_UNLOCK(vp); 136 if (upgraded != 0) 137 VOP_LOCK(vp, LK_DOWNGRADE, td); 138 vn_finished_write(mp); 139 return (EIO); 140 } 141 /* Attempt to msync mmap() regions to clean dirty mmap */ 142 if ((vp->v_iflag & VI_OBJDIRTY) != 0) { 143 VI_UNLOCK(vp); 144 if (vp->v_object != NULL) { 145 VM_OBJECT_LOCK(vp->v_object); 146 vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC); 147 VM_OBJECT_UNLOCK(vp->v_object); 148 } 149 VI_LOCK(vp); 150 } 151 152 /* Wait for pending writes to complete */ 153 spl = splbio(); 154 error = bufobj_wwait(&vp->v_bufobj, 0, 0); 155 if (error != 0) { 156 /* XXX: can't happen with a zero timeout ??? */ 157 splx(spl); 158 VI_UNLOCK(vp); 159 if (upgraded != 0) 160 VOP_LOCK(vp, LK_DOWNGRADE, td); 161 vn_finished_write(mp); 162 return (error); 163 } 164 /* Flush dirty buffers */ 165 if (bo->bo_dirty.bv_cnt > 0) { 166 splx(spl); 167 VI_UNLOCK(vp); 168 if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) { 169 if (upgraded != 0) 170 VOP_LOCK(vp, LK_DOWNGRADE, td); 171 vn_finished_write(mp); 172 return (error); 173 } 174 VI_LOCK(vp); 175 spl = splbio(); 176 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 177 panic("ffs_rawread_sync: dirty bufs"); 178 } 179 splx(spl); 180 VI_UNLOCK(vp); 181 if (upgraded != 0) 182 VOP_LOCK(vp, LK_DOWNGRADE, td); 183 vn_finished_write(mp); 184 } else { 185 splx(spl); 186 VI_UNLOCK(vp); 187 } 188 return 0; 189 } 190 191 192 static int 193 ffs_rawread_readahead(struct vnode *vp, 194 caddr_t udata, 195 off_t offset, 196 size_t len, 197 struct thread *td, 198 struct buf *bp, 199 caddr_t sa) 200 { 201 int error; 202 u_int iolen; 203 off_t blockno; 204 int blockoff; 205 int bsize; 206 struct vnode *dp; 207 int bforwards; 208 struct inode *ip; 209 ufs2_daddr_t blkno; 210 211 bsize = vp->v_mount->mnt_stat.f_iosize; 212 213 ip = VTOI(vp); 214 dp = ip->i_devvp; 215 216 iolen = ((vm_offset_t) udata) & PAGE_MASK; 217 bp->b_bcount = len; 218 if (bp->b_bcount + iolen > bp->b_kvasize) { 219 bp->b_bcount = bp->b_kvasize; 220 if (iolen != 0) 221 bp->b_bcount -= PAGE_SIZE; 222 } 223 bp->b_flags = 0; /* XXX necessary ? */ 224 bp->b_iocmd = BIO_READ; 225 bp->b_iodone = bdone; 226 bp->b_data = udata; 227 bp->b_saveaddr = sa; 228 blockno = offset / bsize; 229 blockoff = (offset % bsize) / DEV_BSIZE; 230 if ((daddr_t) blockno != blockno) { 231 return EINVAL; /* blockno overflow */ 232 } 233 234 bp->b_lblkno = bp->b_blkno = blockno; 235 236 error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL); 237 if (error != 0) 238 return error; 239 if (blkno == -1) { 240 241 /* Fill holes with NULs to preserve semantics */ 242 243 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) 244 bp->b_bcount = bsize - blockoff * DEV_BSIZE; 245 bp->b_bufsize = bp->b_bcount; 246 247 if (vmapbuf(bp) < 0) 248 return EFAULT; 249 250 if (ticks - PCPU_GET(switchticks) >= hogticks) 251 uio_yield(); 252 bzero(bp->b_data, bp->b_bufsize); 253 254 /* Mark operation completed (similar to bufdone()) */ 255 256 bp->b_resid = 0; 257 bp->b_flags |= B_DONE; 258 return 0; 259 } 260 bp->b_blkno = blkno + blockoff; 261 bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE; 262 263 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) 264 bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; 265 bp->b_bufsize = bp->b_bcount; 266 267 if (vmapbuf(bp) < 0) 268 return EFAULT; 269 270 BO_STRATEGY(&dp->v_bufobj, bp); 271 return 0; 272 } 273 274 275 static int 276 ffs_rawread_main(struct vnode *vp, 277 struct uio *uio) 278 { 279 int error, nerror; 280 struct buf *bp, *nbp, *tbp; 281 caddr_t sa, nsa, tsa; 282 u_int iolen; 283 int spl; 284 caddr_t udata; 285 long resid; 286 off_t offset; 287 struct thread *td; 288 289 td = uio->uio_td ? uio->uio_td : curthread; 290 udata = uio->uio_iov->iov_base; 291 resid = uio->uio_resid; 292 offset = uio->uio_offset; 293 294 /* 295 * keep the process from being swapped 296 */ 297 PHOLD(td->td_proc); 298 299 error = 0; 300 nerror = 0; 301 302 bp = NULL; 303 nbp = NULL; 304 sa = NULL; 305 nsa = NULL; 306 307 while (resid > 0) { 308 309 if (bp == NULL) { /* Setup first read */ 310 /* XXX: Leave some bufs for swap */ 311 bp = getpbuf(&ffsrawbufcnt); 312 sa = bp->b_data; 313 pbgetvp(vp, bp); 314 error = ffs_rawread_readahead(vp, udata, offset, 315 resid, td, bp, sa); 316 if (error != 0) 317 break; 318 319 if (resid > bp->b_bufsize) { /* Setup fist readahead */ 320 /* XXX: Leave bufs for swap */ 321 if (rawreadahead != 0) 322 nbp = trypbuf(&ffsrawbufcnt); 323 else 324 nbp = NULL; 325 if (nbp != NULL) { 326 nsa = nbp->b_data; 327 pbgetvp(vp, nbp); 328 329 nerror = ffs_rawread_readahead(vp, 330 udata + 331 bp->b_bufsize, 332 offset + 333 bp->b_bufsize, 334 resid - 335 bp->b_bufsize, 336 td, 337 nbp, 338 nsa); 339 if (nerror) { 340 pbrelvp(nbp); 341 relpbuf(nbp, &ffsrawbufcnt); 342 nbp = NULL; 343 } 344 } 345 } 346 } 347 348 spl = splbio(); 349 bwait(bp, PRIBIO, "rawrd"); 350 splx(spl); 351 352 vunmapbuf(bp); 353 354 iolen = bp->b_bcount - bp->b_resid; 355 if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { 356 nerror = 0; /* Ignore possible beyond EOF error */ 357 break; /* EOF */ 358 } 359 360 if ((bp->b_ioflags & BIO_ERROR) != 0) { 361 error = bp->b_error; 362 break; 363 } 364 resid -= iolen; 365 udata += iolen; 366 offset += iolen; 367 if (iolen < bp->b_bufsize) { 368 /* Incomplete read. Try to read remaining part */ 369 error = ffs_rawread_readahead(vp, 370 udata, 371 offset, 372 bp->b_bufsize - iolen, 373 td, 374 bp, 375 sa); 376 if (error != 0) 377 break; 378 } else if (nbp != NULL) { /* Complete read with readahead */ 379 380 tbp = bp; 381 bp = nbp; 382 nbp = tbp; 383 384 tsa = sa; 385 sa = nsa; 386 nsa = tsa; 387 388 if (resid <= bp->b_bufsize) { /* No more readaheads */ 389 pbrelvp(nbp); 390 relpbuf(nbp, &ffsrawbufcnt); 391 nbp = NULL; 392 } else { /* Setup next readahead */ 393 nerror = ffs_rawread_readahead(vp, 394 udata + 395 bp->b_bufsize, 396 offset + 397 bp->b_bufsize, 398 resid - 399 bp->b_bufsize, 400 td, 401 nbp, 402 nsa); 403 if (nerror != 0) { 404 pbrelvp(nbp); 405 relpbuf(nbp, &ffsrawbufcnt); 406 nbp = NULL; 407 } 408 } 409 } else if (nerror != 0) {/* Deferred Readahead error */ 410 break; 411 } else if (resid > 0) { /* More to read, no readahead */ 412 error = ffs_rawread_readahead(vp, udata, offset, 413 resid, td, bp, sa); 414 if (error != 0) 415 break; 416 } 417 } 418 419 if (bp != NULL) { 420 pbrelvp(bp); 421 relpbuf(bp, &ffsrawbufcnt); 422 } 423 if (nbp != NULL) { /* Run down readahead buffer */ 424 spl = splbio(); 425 bwait(nbp, PRIBIO, "rawrd"); 426 splx(spl); 427 vunmapbuf(nbp); 428 pbrelvp(nbp); 429 relpbuf(nbp, &ffsrawbufcnt); 430 } 431 432 if (error == 0) 433 error = nerror; 434 PRELE(td->td_proc); 435 uio->uio_iov->iov_base = udata; 436 uio->uio_resid = resid; 437 uio->uio_offset = offset; 438 return error; 439 } 440 441 442 int 443 ffs_rawread(struct vnode *vp, 444 struct uio *uio, 445 int *workdone) 446 { 447 if (allowrawread != 0 && 448 uio->uio_iovcnt == 1 && 449 uio->uio_segflg == UIO_USERSPACE && 450 uio->uio_resid == uio->uio_iov->iov_len && 451 (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags & 452 TDP_DEADLKTREAT) == 0) { 453 int secsize; /* Media sector size */ 454 off_t filebytes; /* Bytes left of file */ 455 int blockbytes; /* Bytes left of file in full blocks */ 456 int partialbytes; /* Bytes in last partial block */ 457 int skipbytes; /* Bytes not to read in ffs_rawread */ 458 struct inode *ip; 459 int error; 460 461 462 /* Only handle sector aligned reads */ 463 ip = VTOI(vp); 464 secsize = ip->i_devvp->v_bufobj.bo_bsize; 465 if ((uio->uio_offset & (secsize - 1)) == 0 && 466 (uio->uio_resid & (secsize - 1)) == 0) { 467 468 /* Sync dirty pages and buffers if needed */ 469 error = ffs_rawread_sync(vp, 470 (uio->uio_td != NULL) ? 471 uio->uio_td : curthread); 472 if (error != 0) 473 return error; 474 475 /* Check for end of file */ 476 if (ip->i_size > uio->uio_offset) { 477 filebytes = ip->i_size - uio->uio_offset; 478 479 /* No special eof handling needed ? */ 480 if (uio->uio_resid <= filebytes) { 481 *workdone = 1; 482 return ffs_rawread_main(vp, uio); 483 } 484 485 partialbytes = ((unsigned int) ip->i_size) % 486 ip->i_fs->fs_bsize; 487 blockbytes = (int) filebytes - partialbytes; 488 if (blockbytes > 0) { 489 skipbytes = uio->uio_resid - 490 blockbytes; 491 uio->uio_resid = blockbytes; 492 error = ffs_rawread_main(vp, uio); 493 uio->uio_resid += skipbytes; 494 if (error != 0) 495 return error; 496 /* Read remaining part using buffer */ 497 } 498 } 499 } 500 } 501 *workdone = 0; 502 return 0; 503 } 504