1 /*- 2 * Copyright (c) 2000-2003 Tor Egge 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD$ 27 */ 28 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/fcntl.h> 32 #include <sys/file.h> 33 #include <sys/stat.h> 34 #include <sys/proc.h> 35 #include <sys/mount.h> 36 #include <sys/namei.h> 37 #include <sys/vnode.h> 38 #include <sys/conf.h> 39 #include <sys/filio.h> 40 #include <sys/ttycom.h> 41 #include <sys/bio.h> 42 #include <sys/buf.h> 43 #include <ufs/ufs/quota.h> 44 #include <ufs/ufs/inode.h> 45 #include <ufs/ffs/fs.h> 46 47 #include <machine/limits.h> 48 #include <vm/vm.h> 49 #include <vm/vm_extern.h> 50 #include <vm/vm_object.h> 51 #include <sys/kernel.h> 52 #include <sys/sysctl.h> 53 54 static int ffs_rawread_readahead(struct vnode *vp, 55 caddr_t udata, 56 off_t offset, 57 size_t len, 58 struct thread *td, 59 struct buf *bp, 60 caddr_t sa); 61 static int ffs_rawread_main(struct vnode *vp, 62 struct uio *uio); 63 64 static int ffs_rawread_sync(struct vnode *vp, struct thread *td); 65 66 int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 67 68 void ffs_rawread_setup(void); 69 70 static void ffs_rawreadwakeup(struct buf *bp); 71 72 73 SYSCTL_DECL(_vfs_ffs); 74 75 static int ffsrawbufcnt = 4; 76 SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0, 77 "Buffers available for raw reads"); 78 79 static int allowrawread = 1; 80 SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, 81 "Flag to enable raw reads"); 82 83 static int rawreadahead = 1; 84 SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, 85 "Flag to enable readahead for long raw reads"); 86 87 88 void 89 ffs_rawread_setup(void) 90 { 91 ffsrawbufcnt = (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8; 92 } 93 94 95 static int 96 ffs_rawread_sync(struct vnode *vp, struct thread *td) 97 { 98 int spl; 99 int error; 100 int upgraded; 101 102 GIANT_REQUIRED; 103 /* Check for dirty mmap, pending writes and dirty buffers */ 104 spl = splbio(); 105 VI_LOCK(vp); 106 if (vp->v_numoutput > 0 || 107 !TAILQ_EMPTY(&vp->v_dirtyblkhd) || 108 (vp->v_iflag & VI_OBJDIRTY) != 0) { 109 splx(spl); 110 VI_UNLOCK(vp); 111 112 if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE) { 113 upgraded = 1; 114 /* Upgrade to exclusive lock, this might block */ 115 VOP_LOCK(vp, LK_UPGRADE | LK_NOPAUSE, td); 116 } else 117 upgraded = 0; 118 119 120 /* Attempt to msync mmap() regions to clean dirty mmap */ 121 VI_LOCK(vp); 122 if ((vp->v_iflag & VI_OBJDIRTY) != 0) { 123 struct vm_object *obj; 124 VI_UNLOCK(vp); 125 if (VOP_GETVOBJECT(vp, &obj) == 0) 126 vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); 127 VI_LOCK(vp); 128 } 129 130 /* Wait for pending writes to complete */ 131 spl = splbio(); 132 while (vp->v_numoutput) { 133 vp->v_iflag |= VI_BWAIT; 134 error = msleep((caddr_t)&vp->v_numoutput, 135 VI_MTX(vp), 136 PRIBIO + 1, 137 "rawrdfls", 0); 138 if (error != 0) { 139 splx(spl); 140 VI_UNLOCK(vp); 141 if (upgraded != 0) 142 VOP_LOCK(vp, LK_DOWNGRADE, td); 143 return (error); 144 } 145 } 146 /* Flush dirty buffers */ 147 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 148 splx(spl); 149 VI_UNLOCK(vp); 150 if ((error = VOP_FSYNC(vp, NOCRED, MNT_WAIT, td)) != 0) { 151 if (upgraded != 0) 152 VOP_LOCK(vp, LK_DOWNGRADE, td); 153 return (error); 154 } 155 VI_LOCK(vp); 156 spl = splbio(); 157 if (vp->v_numoutput > 0 || 158 !TAILQ_EMPTY(&vp->v_dirtyblkhd)) 159 panic("ffs_rawread_sync: dirty bufs"); 160 } 161 splx(spl); 162 VI_UNLOCK(vp); 163 if (upgraded != 0) 164 VOP_LOCK(vp, LK_DOWNGRADE, td); 165 } else { 166 splx(spl); 167 VI_UNLOCK(vp); 168 } 169 return 0; 170 } 171 172 173 static int 174 ffs_rawread_readahead(struct vnode *vp, 175 caddr_t udata, 176 off_t offset, 177 size_t len, 178 struct thread *td, 179 struct buf *bp, 180 caddr_t sa) 181 { 182 int error; 183 u_int iolen; 184 off_t blockno; 185 int blockoff; 186 int bsize; 187 struct vnode *dp; 188 int bforwards; 189 190 GIANT_REQUIRED; 191 bsize = vp->v_mount->mnt_stat.f_iosize; 192 193 iolen = ((vm_offset_t) udata) & PAGE_MASK; 194 bp->b_bcount = len; 195 if (bp->b_bcount + iolen > bp->b_kvasize) { 196 bp->b_bcount = bp->b_kvasize; 197 if (iolen != 0) 198 bp->b_bcount -= PAGE_SIZE; 199 } 200 bp->b_flags = B_PHYS; 201 bp->b_iocmd = BIO_READ; 202 bp->b_iodone = ffs_rawreadwakeup; 203 bp->b_data = udata; 204 bp->b_saveaddr = sa; 205 bp->b_offset = offset; 206 blockno = bp->b_offset / bsize; 207 blockoff = (bp->b_offset % bsize) / DEV_BSIZE; 208 if ((daddr_t) blockno != blockno) { 209 return EINVAL; /* blockno overflow */ 210 } 211 212 bp->b_lblkno = bp->b_blkno = blockno; 213 if (!useracc(bp->b_data, bp->b_bcount, VM_PROT_WRITE)) { 214 return EFAULT; 215 } 216 217 error = VOP_BMAP(vp, bp->b_lblkno, &dp, &bp->b_blkno, &bforwards, 218 NULL); 219 if (error != 0) { 220 return error; 221 } 222 if (bp->b_blkno == -1) { 223 224 /* Fill holes with NULs to preserve semantics */ 225 226 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) 227 bp->b_bcount = bsize - blockoff * DEV_BSIZE; 228 bp->b_bufsize = bp->b_bcount; 229 230 if (vmapbuf(bp) < 0) 231 return EFAULT; 232 233 if (ticks - PCPU_GET(switchticks) >= hogticks) 234 uio_yield(); 235 bzero(bp->b_data, bp->b_bufsize); 236 237 /* Mark operation completed (similar to bufdone()) */ 238 239 bp->b_resid = 0; 240 bp->b_flags |= B_DONE; 241 return 0; 242 } 243 244 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) 245 bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; 246 bp->b_bufsize = bp->b_bcount; 247 bp->b_blkno += blockoff; 248 bp->b_dev = dp->v_rdev; 249 250 if (vmapbuf(bp) < 0) 251 return EFAULT; 252 253 if (dp->v_type == VCHR) 254 (void) VOP_SPECSTRATEGY(dp, bp); 255 else 256 (void) VOP_STRATEGY(dp, bp); 257 return 0; 258 } 259 260 261 static int 262 ffs_rawread_main(struct vnode *vp, 263 struct uio *uio) 264 { 265 int error, nerror; 266 struct buf *bp, *nbp, *tbp; 267 caddr_t sa, nsa, tsa; 268 u_int iolen; 269 int spl; 270 caddr_t udata; 271 long resid; 272 off_t offset; 273 struct thread *td; 274 275 GIANT_REQUIRED; 276 td = uio->uio_td ? uio->uio_td : curthread; 277 udata = uio->uio_iov->iov_base; 278 resid = uio->uio_resid; 279 offset = uio->uio_offset; 280 281 /* 282 * keep the process from being swapped 283 */ 284 PHOLD(td->td_proc); 285 286 error = 0; 287 nerror = 0; 288 289 bp = NULL; 290 nbp = NULL; 291 sa = NULL; 292 nsa = NULL; 293 294 while (resid > 0) { 295 296 if (bp == NULL) { /* Setup first read */ 297 /* XXX: Leave some bufs for swap */ 298 bp = getpbuf(&ffsrawbufcnt); 299 sa = bp->b_data; 300 bp->b_vp = vp; 301 bp->b_error = 0; 302 error = ffs_rawread_readahead(vp, udata, offset, 303 resid, td, bp, sa); 304 if (error != 0) 305 break; 306 307 if (resid > bp->b_bufsize) { /* Setup fist readahead */ 308 /* XXX: Leave bufs for swap */ 309 if (rawreadahead != 0) 310 nbp = trypbuf(&ffsrawbufcnt); 311 else 312 nbp = NULL; 313 if (nbp != NULL) { 314 nsa = nbp->b_data; 315 nbp->b_vp = vp; 316 nbp->b_error = 0; 317 318 nerror = ffs_rawread_readahead(vp, 319 udata + 320 bp->b_bufsize, 321 offset + 322 bp->b_bufsize, 323 resid - 324 bp->b_bufsize, 325 td, 326 nbp, 327 nsa); 328 if (nerror) { 329 relpbuf(nbp, &ffsrawbufcnt); 330 nbp = NULL; 331 } 332 } 333 } 334 } 335 336 spl = splbio(); 337 bwait(bp, PRIBIO, "rawrd"); 338 splx(spl); 339 340 vunmapbuf(bp); 341 342 iolen = bp->b_bcount - bp->b_resid; 343 if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { 344 nerror = 0; /* Ignore possible beyond EOF error */ 345 break; /* EOF */ 346 } 347 348 if ((bp->b_ioflags & BIO_ERROR) != 0) { 349 error = bp->b_error; 350 break; 351 } 352 resid -= iolen; 353 udata += iolen; 354 offset += iolen; 355 if (iolen < bp->b_bufsize) { 356 /* Incomplete read. Try to read remaining part */ 357 error = ffs_rawread_readahead(vp, 358 udata, 359 offset, 360 bp->b_bufsize - iolen, 361 td, 362 bp, 363 sa); 364 if (error != 0) 365 break; 366 } else if (nbp != NULL) { /* Complete read with readahead */ 367 368 tbp = bp; 369 bp = nbp; 370 nbp = tbp; 371 372 tsa = sa; 373 sa = nsa; 374 nsa = tsa; 375 376 if (resid <= bp->b_bufsize) { /* No more readaheads */ 377 relpbuf(nbp, &ffsrawbufcnt); 378 nbp = NULL; 379 } else { /* Setup next readahead */ 380 nerror = ffs_rawread_readahead(vp, 381 udata + 382 bp->b_bufsize, 383 offset + 384 bp->b_bufsize, 385 resid - 386 bp->b_bufsize, 387 td, 388 nbp, 389 nsa); 390 if (nerror != 0) { 391 relpbuf(nbp, &ffsrawbufcnt); 392 nbp = NULL; 393 } 394 } 395 } else if (nerror != 0) {/* Deferred Readahead error */ 396 break; 397 } else if (resid > 0) { /* More to read, no readahead */ 398 error = ffs_rawread_readahead(vp, udata, offset, 399 resid, td, bp, sa); 400 if (error != 0) 401 break; 402 } 403 } 404 405 if (bp != NULL) 406 relpbuf(bp, &ffsrawbufcnt); 407 if (nbp != NULL) { /* Run down readahead buffer */ 408 spl = splbio(); 409 bwait(nbp, PRIBIO, "rawrd"); 410 splx(spl); 411 vunmapbuf(nbp); 412 relpbuf(nbp, &ffsrawbufcnt); 413 } 414 415 if (error == 0) 416 error = nerror; 417 PRELE(td->td_proc); 418 uio->uio_iov->iov_base = udata; 419 uio->uio_resid = resid; 420 uio->uio_offset = offset; 421 return error; 422 } 423 424 425 int 426 ffs_rawread(struct vnode *vp, 427 struct uio *uio, 428 int *workdone) 429 { 430 if (allowrawread != 0 && 431 uio->uio_iovcnt == 1 && 432 uio->uio_segflg == UIO_USERSPACE && 433 uio->uio_resid == uio->uio_iov->iov_len && 434 (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_flags & 435 TDF_DEADLKTREAT) == 0) { 436 int secsize; /* Media sector size */ 437 off_t filebytes; /* Bytes left of file */ 438 int blockbytes; /* Bytes left of file in full blocks */ 439 int partialbytes; /* Bytes in last partial block */ 440 int skipbytes; /* Bytes not to read in ffs_rawread */ 441 struct inode *ip; 442 int error; 443 444 445 /* Only handle sector aligned reads */ 446 ip = VTOI(vp); 447 secsize = ip->i_devvp->v_rdev->si_bsize_phys; 448 if ((uio->uio_offset & (secsize - 1)) == 0 && 449 (uio->uio_resid & (secsize - 1)) == 0) { 450 451 /* Sync dirty pages and buffers if needed */ 452 error = ffs_rawread_sync(vp, 453 (uio->uio_td != NULL) ? 454 uio->uio_td : curthread); 455 if (error != 0) 456 return error; 457 458 /* Check for end of file */ 459 if (ip->i_size > uio->uio_offset) { 460 filebytes = ip->i_size - uio->uio_offset; 461 462 /* No special eof handling needed ? */ 463 if (uio->uio_resid <= filebytes) { 464 *workdone = 1; 465 return ffs_rawread_main(vp, uio); 466 } 467 468 partialbytes = ((unsigned int) ip->i_size) % 469 ip->i_fs->fs_bsize; 470 blockbytes = (int) filebytes - partialbytes; 471 if (blockbytes > 0) { 472 skipbytes = uio->uio_resid - 473 blockbytes; 474 uio->uio_resid = blockbytes; 475 error = ffs_rawread_main(vp, uio); 476 uio->uio_resid += skipbytes; 477 if (error != 0) 478 return error; 479 /* Read remaining part using buffer */ 480 } 481 } 482 } 483 } 484 *workdone = 0; 485 return 0; 486 } 487 488 489 static void 490 ffs_rawreadwakeup(struct buf *bp) 491 { 492 bdone(bp); 493 } 494