1 /*- 2 * Copyright (c) 2000-2003 Tor Egge 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/fcntl.h> 33 #include <sys/file.h> 34 #include <sys/stat.h> 35 #include <sys/proc.h> 36 #include <sys/limits.h> 37 #include <sys/mount.h> 38 #include <sys/namei.h> 39 #include <sys/vnode.h> 40 #include <sys/conf.h> 41 #include <sys/filio.h> 42 #include <sys/ttycom.h> 43 #include <sys/bio.h> 44 #include <sys/buf.h> 45 #include <ufs/ufs/extattr.h> 46 #include <ufs/ufs/quota.h> 47 #include <ufs/ufs/inode.h> 48 #include <ufs/ufs/ufsmount.h> 49 #include <ufs/ufs/ufs_extern.h> 50 #include <ufs/ffs/fs.h> 51 #include <ufs/ffs/ffs_extern.h> 52 53 #include <vm/vm.h> 54 #include <vm/vm_extern.h> 55 #include <vm/vm_object.h> 56 #include <sys/kernel.h> 57 #include <sys/sysctl.h> 58 59 static int ffs_rawread_readahead(struct vnode *vp, 60 caddr_t udata, 61 off_t offset, 62 size_t len, 63 struct thread *td, 64 struct buf *bp, 65 caddr_t sa); 66 static int ffs_rawread_main(struct vnode *vp, 67 struct uio *uio); 68 69 static int ffs_rawread_sync(struct vnode *vp, struct thread *td); 70 71 int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 72 73 void ffs_rawread_setup(void); 74 75 SYSCTL_DECL(_vfs_ffs); 76 77 static int ffsrawbufcnt = 4; 78 SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0, 79 "Buffers available for raw reads"); 80 81 static int allowrawread = 1; 82 SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, 83 "Flag to enable raw reads"); 84 85 static int rawreadahead = 1; 86 SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, 87 "Flag to enable readahead for long raw reads"); 88 89 90 void 91 ffs_rawread_setup(void) 92 { 93 ffsrawbufcnt = (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8; 94 } 95 96 97 static int 98 ffs_rawread_sync(struct vnode *vp, struct thread *td) 99 { 100 int spl; 101 int error; 102 int upgraded; 103 struct bufobj *bo; 104 105 /* Check for dirty mmap, pending writes and dirty buffers */ 106 spl = splbio(); 107 VI_LOCK(vp); 108 bo = &vp->v_bufobj; 109 if (bo->bo_numoutput > 0 || 110 bo->bo_dirty.bv_cnt > 0 || 111 (vp->v_iflag & VI_OBJDIRTY) != 0) { 112 splx(spl); 113 VI_UNLOCK(vp); 114 115 if (VOP_ISLOCKED(vp, td) != LK_EXCLUSIVE) { 116 upgraded = 1; 117 /* Upgrade to exclusive lock, this might block */ 118 VOP_LOCK(vp, LK_UPGRADE, td); 119 } else 120 upgraded = 0; 121 122 123 /* Attempt to msync mmap() regions to clean dirty mmap */ 124 VI_LOCK(vp); 125 if ((vp->v_iflag & VI_OBJDIRTY) != 0) { 126 VI_UNLOCK(vp); 127 if (vp->v_object != NULL) { 128 VM_OBJECT_LOCK(vp->v_object); 129 vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC); 130 VM_OBJECT_UNLOCK(vp->v_object); 131 } 132 VI_LOCK(vp); 133 } 134 135 /* Wait for pending writes to complete */ 136 spl = splbio(); 137 error = bufobj_wwait(&vp->v_bufobj, 0, 0); 138 if (error != 0) { 139 /* XXX: can't happen with a zero timeout ??? */ 140 splx(spl); 141 VI_UNLOCK(vp); 142 if (upgraded != 0) 143 VOP_LOCK(vp, LK_DOWNGRADE, td); 144 return (error); 145 } 146 /* Flush dirty buffers */ 147 if (bo->bo_dirty.bv_cnt > 0) { 148 splx(spl); 149 VI_UNLOCK(vp); 150 if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) { 151 if (upgraded != 0) 152 VOP_LOCK(vp, LK_DOWNGRADE, td); 153 return (error); 154 } 155 VI_LOCK(vp); 156 spl = splbio(); 157 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 158 panic("ffs_rawread_sync: dirty bufs"); 159 } 160 splx(spl); 161 VI_UNLOCK(vp); 162 if (upgraded != 0) 163 VOP_LOCK(vp, LK_DOWNGRADE, td); 164 } else { 165 splx(spl); 166 VI_UNLOCK(vp); 167 } 168 return 0; 169 } 170 171 172 static int 173 ffs_rawread_readahead(struct vnode *vp, 174 caddr_t udata, 175 off_t offset, 176 size_t len, 177 struct thread *td, 178 struct buf *bp, 179 caddr_t sa) 180 { 181 int error; 182 u_int iolen; 183 off_t blockno; 184 int blockoff; 185 int bsize; 186 struct vnode *dp; 187 int bforwards; 188 struct inode *ip; 189 ufs2_daddr_t blkno; 190 191 bsize = vp->v_mount->mnt_stat.f_iosize; 192 193 ip = VTOI(vp); 194 dp = ip->i_devvp; 195 196 iolen = ((vm_offset_t) udata) & PAGE_MASK; 197 bp->b_bcount = len; 198 if (bp->b_bcount + iolen > bp->b_kvasize) { 199 bp->b_bcount = bp->b_kvasize; 200 if (iolen != 0) 201 bp->b_bcount -= PAGE_SIZE; 202 } 203 bp->b_flags = 0; /* XXX necessary ? */ 204 bp->b_iocmd = BIO_READ; 205 bp->b_iodone = bdone; 206 bp->b_data = udata; 207 bp->b_saveaddr = sa; 208 blockno = offset / bsize; 209 blockoff = (offset % bsize) / DEV_BSIZE; 210 if ((daddr_t) blockno != blockno) { 211 return EINVAL; /* blockno overflow */ 212 } 213 214 bp->b_lblkno = bp->b_blkno = blockno; 215 216 error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL); 217 if (error != 0) 218 return error; 219 if (blkno == -1) { 220 221 /* Fill holes with NULs to preserve semantics */ 222 223 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) 224 bp->b_bcount = bsize - blockoff * DEV_BSIZE; 225 bp->b_bufsize = bp->b_bcount; 226 227 if (vmapbuf(bp) < 0) 228 return EFAULT; 229 230 if (ticks - PCPU_GET(switchticks) >= hogticks) 231 uio_yield(); 232 bzero(bp->b_data, bp->b_bufsize); 233 234 /* Mark operation completed (similar to bufdone()) */ 235 236 bp->b_resid = 0; 237 bp->b_flags |= B_DONE; 238 return 0; 239 } 240 bp->b_blkno = blkno + blockoff; 241 bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE; 242 243 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) 244 bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; 245 bp->b_bufsize = bp->b_bcount; 246 247 if (vmapbuf(bp) < 0) 248 return EFAULT; 249 250 BO_STRATEGY(&dp->v_bufobj, bp); 251 return 0; 252 } 253 254 255 static int 256 ffs_rawread_main(struct vnode *vp, 257 struct uio *uio) 258 { 259 int error, nerror; 260 struct buf *bp, *nbp, *tbp; 261 caddr_t sa, nsa, tsa; 262 u_int iolen; 263 int spl; 264 caddr_t udata; 265 long resid; 266 off_t offset; 267 struct thread *td; 268 269 td = uio->uio_td ? uio->uio_td : curthread; 270 udata = uio->uio_iov->iov_base; 271 resid = uio->uio_resid; 272 offset = uio->uio_offset; 273 274 /* 275 * keep the process from being swapped 276 */ 277 PHOLD(td->td_proc); 278 279 error = 0; 280 nerror = 0; 281 282 bp = NULL; 283 nbp = NULL; 284 sa = NULL; 285 nsa = NULL; 286 287 while (resid > 0) { 288 289 if (bp == NULL) { /* Setup first read */ 290 /* XXX: Leave some bufs for swap */ 291 bp = getpbuf(&ffsrawbufcnt); 292 sa = bp->b_data; 293 bp->b_vp = vp; 294 error = ffs_rawread_readahead(vp, udata, offset, 295 resid, td, bp, sa); 296 if (error != 0) 297 break; 298 299 if (resid > bp->b_bufsize) { /* Setup fist readahead */ 300 /* XXX: Leave bufs for swap */ 301 if (rawreadahead != 0) 302 nbp = trypbuf(&ffsrawbufcnt); 303 else 304 nbp = NULL; 305 if (nbp != NULL) { 306 nsa = nbp->b_data; 307 nbp->b_vp = vp; 308 309 nerror = ffs_rawread_readahead(vp, 310 udata + 311 bp->b_bufsize, 312 offset + 313 bp->b_bufsize, 314 resid - 315 bp->b_bufsize, 316 td, 317 nbp, 318 nsa); 319 if (nerror) { 320 relpbuf(nbp, &ffsrawbufcnt); 321 nbp = NULL; 322 } 323 } 324 } 325 } 326 327 spl = splbio(); 328 bwait(bp, PRIBIO, "rawrd"); 329 splx(spl); 330 331 vunmapbuf(bp); 332 333 iolen = bp->b_bcount - bp->b_resid; 334 if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { 335 nerror = 0; /* Ignore possible beyond EOF error */ 336 break; /* EOF */ 337 } 338 339 if ((bp->b_ioflags & BIO_ERROR) != 0) { 340 error = bp->b_error; 341 break; 342 } 343 resid -= iolen; 344 udata += iolen; 345 offset += iolen; 346 if (iolen < bp->b_bufsize) { 347 /* Incomplete read. Try to read remaining part */ 348 error = ffs_rawread_readahead(vp, 349 udata, 350 offset, 351 bp->b_bufsize - iolen, 352 td, 353 bp, 354 sa); 355 if (error != 0) 356 break; 357 } else if (nbp != NULL) { /* Complete read with readahead */ 358 359 tbp = bp; 360 bp = nbp; 361 nbp = tbp; 362 363 tsa = sa; 364 sa = nsa; 365 nsa = tsa; 366 367 if (resid <= bp->b_bufsize) { /* No more readaheads */ 368 relpbuf(nbp, &ffsrawbufcnt); 369 nbp = NULL; 370 } else { /* Setup next readahead */ 371 nerror = ffs_rawread_readahead(vp, 372 udata + 373 bp->b_bufsize, 374 offset + 375 bp->b_bufsize, 376 resid - 377 bp->b_bufsize, 378 td, 379 nbp, 380 nsa); 381 if (nerror != 0) { 382 relpbuf(nbp, &ffsrawbufcnt); 383 nbp = NULL; 384 } 385 } 386 } else if (nerror != 0) {/* Deferred Readahead error */ 387 break; 388 } else if (resid > 0) { /* More to read, no readahead */ 389 error = ffs_rawread_readahead(vp, udata, offset, 390 resid, td, bp, sa); 391 if (error != 0) 392 break; 393 } 394 } 395 396 if (bp != NULL) 397 relpbuf(bp, &ffsrawbufcnt); 398 if (nbp != NULL) { /* Run down readahead buffer */ 399 spl = splbio(); 400 bwait(nbp, PRIBIO, "rawrd"); 401 splx(spl); 402 vunmapbuf(nbp); 403 relpbuf(nbp, &ffsrawbufcnt); 404 } 405 406 if (error == 0) 407 error = nerror; 408 PRELE(td->td_proc); 409 uio->uio_iov->iov_base = udata; 410 uio->uio_resid = resid; 411 uio->uio_offset = offset; 412 return error; 413 } 414 415 416 int 417 ffs_rawread(struct vnode *vp, 418 struct uio *uio, 419 int *workdone) 420 { 421 if (allowrawread != 0 && 422 uio->uio_iovcnt == 1 && 423 uio->uio_segflg == UIO_USERSPACE && 424 uio->uio_resid == uio->uio_iov->iov_len && 425 (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags & 426 TDP_DEADLKTREAT) == 0) { 427 int secsize; /* Media sector size */ 428 off_t filebytes; /* Bytes left of file */ 429 int blockbytes; /* Bytes left of file in full blocks */ 430 int partialbytes; /* Bytes in last partial block */ 431 int skipbytes; /* Bytes not to read in ffs_rawread */ 432 struct inode *ip; 433 int error; 434 435 436 /* Only handle sector aligned reads */ 437 ip = VTOI(vp); 438 secsize = ip->i_devvp->v_bufobj.bo_bsize; 439 if ((uio->uio_offset & (secsize - 1)) == 0 && 440 (uio->uio_resid & (secsize - 1)) == 0) { 441 442 /* Sync dirty pages and buffers if needed */ 443 error = ffs_rawread_sync(vp, 444 (uio->uio_td != NULL) ? 445 uio->uio_td : curthread); 446 if (error != 0) 447 return error; 448 449 /* Check for end of file */ 450 if (ip->i_size > uio->uio_offset) { 451 filebytes = ip->i_size - uio->uio_offset; 452 453 /* No special eof handling needed ? */ 454 if (uio->uio_resid <= filebytes) { 455 *workdone = 1; 456 return ffs_rawread_main(vp, uio); 457 } 458 459 partialbytes = ((unsigned int) ip->i_size) % 460 ip->i_fs->fs_bsize; 461 blockbytes = (int) filebytes - partialbytes; 462 if (blockbytes > 0) { 463 skipbytes = uio->uio_resid - 464 blockbytes; 465 uio->uio_resid = blockbytes; 466 error = ffs_rawread_main(vp, uio); 467 uio->uio_resid += skipbytes; 468 if (error != 0) 469 return error; 470 /* Read remaining part using buffer */ 471 } 472 } 473 } 474 } 475 *workdone = 0; 476 return 0; 477 } 478