/*-
 * Copyright (c) 2000-2003 Tor Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static int ffs_rawread_readahead(struct vnode *vp, caddr_t udata,
    off_t offset, size_t len, struct thread *td, struct buf *bp, caddr_t sa);
static int ffs_rawread_main(struct vnode *vp, struct uio *uio);

static int ffs_rawread_sync(struct vnode *vp, struct thread *td);

int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);

void ffs_rawread_setup(void);

static void ffs_rawreadwakeup(struct buf *bp);


SYSCTL_DECL(_vfs_ffs);

static int ffsrawbufcnt = 4;
SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0,
    "Buffers available for raw reads");

static int allowrawread = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
    "Flag to enable raw reads");

static int rawreadahead = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
    "Flag to enable readahead for long raw reads");


void
ffs_rawread_setup(void)
{
        ffsrawbufcnt = (nswbuf > 100) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8;
}


static int
ffs_rawread_sync(struct vnode *vp, struct thread *td)
{
        int spl;
        int error;
        int upgraded;
        struct bufobj *bo;

        GIANT_REQUIRED;
        /* Check for dirty mmap, pending writes and dirty buffers */
        spl = splbio();
        VI_LOCK(vp);
        bo = &vp->v_bufobj;
        if (bo->bo_numoutput > 0 ||
            bo->bo_dirty.bv_cnt > 0 ||
            (vp->v_iflag & VI_OBJDIRTY) != 0) {
                splx(spl);
                VI_UNLOCK(vp);

                if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE) {
                        upgraded = 1;
                        /* Upgrade to exclusive lock, this might block */
                        VOP_LOCK(vp, LK_UPGRADE | LK_NOPAUSE, td);
                } else
                        upgraded = 0;

                /* Attempt to msync mmap() regions to clean dirty mmap */
                VI_LOCK(vp);
                if ((vp->v_iflag & VI_OBJDIRTY) != 0) {
                        VI_UNLOCK(vp);
                        if (vp->v_object != NULL) {
                                VM_OBJECT_LOCK(vp->v_object);
                                vm_object_page_clean(vp->v_object, 0, 0,
                                    OBJPC_SYNC);
                                VM_OBJECT_UNLOCK(vp->v_object);
                        }
                        VI_LOCK(vp);
                }

                /* Wait for pending writes to complete */
                spl = splbio();
                error = bufobj_wwait(&vp->v_bufobj, 0, 0);
                if (error != 0) {
                        /* XXX: can't happen with a zero timeout ??? */
                        splx(spl);
                        VI_UNLOCK(vp);
                        if (upgraded != 0)
                                VOP_LOCK(vp, LK_DOWNGRADE, td);
                        return (error);
                }
                /* Flush dirty buffers */
                if (bo->bo_dirty.bv_cnt > 0) {
                        splx(spl);
                        VI_UNLOCK(vp);
                        if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) {
                                if (upgraded != 0)
                                        VOP_LOCK(vp, LK_DOWNGRADE, td);
                                return (error);
                        }
                        VI_LOCK(vp);
                        spl = splbio();
                        if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
                                panic("ffs_rawread_sync: dirty bufs");
                }
                splx(spl);
                VI_UNLOCK(vp);
                if (upgraded != 0)
                        VOP_LOCK(vp, LK_DOWNGRADE, td);
        } else {
                splx(spl);
                VI_UNLOCK(vp);
        }
        return 0;
}


static int
ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t offset,
    size_t len, struct thread *td, struct buf *bp, caddr_t sa)
{
        int error;
        u_int iolen;
        off_t blockno;
        int blockoff;
        int bsize;
        struct vnode *dp;
        int bforwards;
        struct inode *ip;
        ufs2_daddr_t blkno;

        GIANT_REQUIRED;
        bsize = vp->v_mount->mnt_stat.f_iosize;

        ip = VTOI(vp);
        dp = ip->i_devvp;

        iolen = ((vm_offset_t) udata) & PAGE_MASK;
        bp->b_bcount = len;
        if (bp->b_bcount + iolen > bp->b_kvasize) {
                bp->b_bcount = bp->b_kvasize;
                if (iolen != 0)
                        bp->b_bcount -= PAGE_SIZE;
        }
        bp->b_flags = 0;        /* XXX necessary ? */
        bp->b_iocmd = BIO_READ;
        bp->b_iodone = ffs_rawreadwakeup;
        bp->b_data = udata;
        bp->b_saveaddr = sa;
        blockno = offset / bsize;
        blockoff = (offset % bsize) / DEV_BSIZE;
        if ((daddr_t) blockno != blockno) {
                return EINVAL;          /* blockno overflow */
        }

        bp->b_lblkno = bp->b_blkno = blockno;

        error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards,
            NULL);
        if (error != 0)
                return error;
        if (blkno == -1) {

                /* Fill holes with NULs to preserve semantics */

                if (bp->b_bcount + blockoff * DEV_BSIZE > bsize)
                        bp->b_bcount = bsize - blockoff * DEV_BSIZE;
                bp->b_bufsize = bp->b_bcount;

                if (vmapbuf(bp) < 0)
                        return EFAULT;

                if (ticks - PCPU_GET(switchticks) >= hogticks)
                        uio_yield();
                bzero(bp->b_data, bp->b_bufsize);

                /* Mark operation completed (similar to bufdone()) */

                bp->b_resid = 0;
                bp->b_flags |= B_DONE;
                return 0;
        }
        bp->b_blkno = blkno + blockoff;
        bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE;

        if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
                bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;
        bp->b_bufsize = bp->b_bcount;

        if (vmapbuf(bp) < 0)
                return EFAULT;

        BO_STRATEGY(&dp->v_bufobj, bp);
        return 0;
}


static int
ffs_rawread_main(struct vnode *vp, struct uio *uio)
{
        int error, nerror;
        struct buf *bp, *nbp, *tbp;
        caddr_t sa, nsa, tsa;
        u_int iolen;
        int spl;
        caddr_t udata;
        long resid;
        off_t offset;
        struct thread *td;

        GIANT_REQUIRED;
        td = uio->uio_td ? uio->uio_td : curthread;
        udata = uio->uio_iov->iov_base;
        resid = uio->uio_resid;
        offset = uio->uio_offset;

        /*
         * keep the process from being swapped
         */
        PHOLD(td->td_proc);

        error = 0;
        nerror = 0;

        bp = NULL;
        nbp = NULL;
        sa = NULL;
        nsa = NULL;

        while (resid > 0) {

                if (bp == NULL) { /* Setup first read */
                        /* XXX: Leave some bufs for swap */
                        bp = getpbuf(&ffsrawbufcnt);
                        sa = bp->b_data;
                        bp->b_vp = vp;
                        error = ffs_rawread_readahead(vp, udata, offset,
                            resid, td, bp, sa);
                        if (error != 0)
                                break;

                        if (resid > bp->b_bufsize) { /* Setup first readahead */
                                /* XXX: Leave bufs for swap */
                                if (rawreadahead != 0)
                                        nbp = trypbuf(&ffsrawbufcnt);
                                else
                                        nbp = NULL;
                                if (nbp != NULL) {
                                        nsa = nbp->b_data;
                                        nbp->b_vp = vp;

                                        nerror = ffs_rawread_readahead(vp,
                                            udata + bp->b_bufsize,
                                            offset + bp->b_bufsize,
                                            resid - bp->b_bufsize,
                                            td, nbp, nsa);
                                        if (nerror) {
                                                relpbuf(nbp, &ffsrawbufcnt);
                                                nbp = NULL;
                                        }
                                }
                        }
                }

                spl = splbio();
                bwait(bp, PRIBIO, "rawrd");
                splx(spl);

                vunmapbuf(bp);

                iolen = bp->b_bcount - bp->b_resid;
                if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
                        nerror = 0;     /* Ignore possible beyond EOF error */
                        break;          /* EOF */
                }

                if ((bp->b_ioflags & BIO_ERROR) != 0) {
                        error = bp->b_error;
                        break;
                }
                resid -= iolen;
                udata += iolen;
                offset += iolen;
                if (iolen < bp->b_bufsize) {
                        /* Incomplete read.  Try to read remaining part */
                        error = ffs_rawread_readahead(vp, udata, offset,
                            bp->b_bufsize - iolen, td, bp, sa);
                        if (error != 0)
                                break;
                } else if (nbp != NULL) { /* Complete read with readahead */

                        tbp = bp;
                        bp = nbp;
                        nbp = tbp;

                        tsa = sa;
                        sa = nsa;
                        nsa = tsa;

                        if (resid <= bp->b_bufsize) { /* No more readaheads */
                                relpbuf(nbp, &ffsrawbufcnt);
                                nbp = NULL;
                        } else { /* Setup next readahead */
                                nerror = ffs_rawread_readahead(vp,
                                    udata + bp->b_bufsize,
                                    offset + bp->b_bufsize,
                                    resid - bp->b_bufsize,
                                    td, nbp, nsa);
                                if (nerror != 0) {
                                        relpbuf(nbp, &ffsrawbufcnt);
                                        nbp = NULL;
                                }
                        }
                } else if (nerror != 0) { /* Deferred readahead error */
                        break;
                } else if (resid > 0) { /* More to read, no readahead */
                        error = ffs_rawread_readahead(vp, udata, offset,
                            resid, td, bp, sa);
                        if (error != 0)
                                break;
                }
        }

        if (bp != NULL)
                relpbuf(bp, &ffsrawbufcnt);
        if (nbp != NULL) {              /* Run down readahead buffer */
                spl = splbio();
                bwait(nbp, PRIBIO, "rawrd");
                splx(spl);
                vunmapbuf(nbp);
                relpbuf(nbp, &ffsrawbufcnt);
        }

        if (error == 0)
                error = nerror;
        PRELE(td->td_proc);
        uio->uio_iov->iov_base = udata;
        uio->uio_resid = resid;
        uio->uio_offset = offset;
        return error;
}


int
ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone)
{
        if (allowrawread != 0 &&
            uio->uio_iovcnt == 1 &&
            uio->uio_segflg == UIO_USERSPACE &&
            uio->uio_resid == uio->uio_iov->iov_len &&
            (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags &
             TDP_DEADLKTREAT) == 0) {
                int secsize;            /* Media sector size */
                off_t filebytes;        /* Bytes left of file */
                int blockbytes;         /* Bytes left of file in full blocks */
                int partialbytes;       /* Bytes in last partial block */
                int skipbytes;          /* Bytes not to read in ffs_rawread */
                struct inode *ip;
                int error;

                /* Only handle sector aligned reads */
                ip = VTOI(vp);
                secsize = ip->i_devvp->v_bufobj.bo_bsize;
                if ((uio->uio_offset & (secsize - 1)) == 0 &&
                    (uio->uio_resid & (secsize - 1)) == 0) {

                        /* Sync dirty pages and buffers if needed */
                        error = ffs_rawread_sync(vp,
                            (uio->uio_td != NULL) ? uio->uio_td : curthread);
                        if (error != 0)
                                return error;

                        /* Check for end of file */
                        if (ip->i_size > uio->uio_offset) {
                                filebytes = ip->i_size - uio->uio_offset;

                                /* No special eof handling needed ? */
                                if (uio->uio_resid <= filebytes) {
                                        *workdone = 1;
                                        return ffs_rawread_main(vp, uio);
                                }

                                partialbytes = ((unsigned int) ip->i_size) %
                                    ip->i_fs->fs_bsize;
                                blockbytes = (int) filebytes - partialbytes;
                                if (blockbytes > 0) {
                                        skipbytes = uio->uio_resid -
                                            blockbytes;
                                        uio->uio_resid = blockbytes;
                                        error = ffs_rawread_main(vp, uio);
                                        uio->uio_resid += skipbytes;
                                        if (error != 0)
                                                return error;
                                        /* Read remaining part using buffer */
                                }
                        }
                }
        }
        *workdone = 0;
        return 0;
}


static void
ffs_rawreadwakeup(struct buf *bp)
{
        bdone(bp);
}