/*-
 * Copyright (c) 2000-2003 Tor Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/rwlock.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static int ffs_rawread_readahead(struct vnode *vp, caddr_t udata,
    off_t offset, size_t len, struct thread *td, struct buf *bp, caddr_t sa);
static int ffs_rawread_main(struct vnode *vp, struct uio *uio);

static int ffs_rawread_sync(struct vnode *vp);

int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);

SYSCTL_DECL(_vfs_ffs);

static int ffsrawbufcnt = 4;
SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0,
    "Buffers available for raw reads");

static int allowrawread = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
    "Flag to enable raw reads");

static int rawreadahead = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
    "Flag to enable readahead for long raw reads");
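
/*
 * Boot-time sizing of the raw read buffer pool.  ffsrawbufcnt starts at a
 * conservative 4 and is recomputed below once nswbuf (the pool of pbufs
 * shared with the pagers) is known: keep all but one sixteenth of a large
 * pool, otherwise leave 8 buffers behind for the rest of the system.
 */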
static void
ffs_rawread_setup(void *arg __unused)
{

        ffsrawbufcnt = (nswbuf > 100) ?
            (nswbuf - (nswbuf >> 4)) : nswbuf - 8;
}
SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL);

static int
ffs_rawread_sync(struct vnode *vp)
{
        int error;
        int upgraded;
        struct bufobj *bo;
        struct mount *mp;
        vm_object_t obj;

        /* Check for dirty mmap, pending writes and dirty buffers */
        bo = &vp->v_bufobj;
        BO_LOCK(bo);
        VI_LOCK(vp);
        if (bo->bo_numoutput > 0 ||
            bo->bo_dirty.bv_cnt > 0 ||
            ((obj = vp->v_object) != NULL &&
             (obj->flags & OBJ_MIGHTBEDIRTY) != 0)) {
                VI_UNLOCK(vp);
                BO_UNLOCK(bo);

                if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
                        if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
                                upgraded = 1;
                        else
                                upgraded = 0;
                        VOP_UNLOCK(vp, 0);
                        (void) vn_start_write(vp, &mp, V_WAIT);
                        VOP_LOCK(vp, LK_EXCLUSIVE);
                } else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
                        upgraded = 1;
                        /* Upgrade to exclusive lock, this might block */
                        VOP_LOCK(vp, LK_UPGRADE);
                } else
                        upgraded = 0;

                VI_LOCK(vp);
                /* Check if vnode was reclaimed while unlocked. */
                if ((vp->v_iflag & VI_DOOMED) != 0) {
                        VI_UNLOCK(vp);
                        if (upgraded != 0)
                                VOP_LOCK(vp, LK_DOWNGRADE);
                        vn_finished_write(mp);
                        return (EIO);
                }
                /* Attempt to msync mmap() regions to clean dirty mmap */
                if ((obj = vp->v_object) != NULL &&
                    (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
                        VI_UNLOCK(vp);
                        VM_OBJECT_WLOCK(obj);
                        vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
                        VM_OBJECT_WUNLOCK(obj);
                } else
                        VI_UNLOCK(vp);

                /* Wait for pending writes to complete */
                BO_LOCK(bo);
                error = bufobj_wwait(&vp->v_bufobj, 0, 0);
                if (error != 0) {
                        /* XXX: can't happen with a zero timeout ??? */
                        BO_UNLOCK(bo);
                        if (upgraded != 0)
                                VOP_LOCK(vp, LK_DOWNGRADE);
                        vn_finished_write(mp);
                        return (error);
                }
                /* Flush dirty buffers */
                if (bo->bo_dirty.bv_cnt > 0) {
                        BO_UNLOCK(bo);
                        if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) {
                                if (upgraded != 0)
                                        VOP_LOCK(vp, LK_DOWNGRADE);
                                vn_finished_write(mp);
                                return (error);
                        }
                        BO_LOCK(bo);
                        if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
                                panic("ffs_rawread_sync: dirty bufs");
                }
                BO_UNLOCK(bo);
                if (upgraded != 0)
                        VOP_LOCK(vp, LK_DOWNGRADE);
                vn_finished_write(mp);
        } else {
                VI_UNLOCK(vp);
                BO_UNLOCK(bo);
        }
        return 0;
}
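
/*
 * Start one direct transfer of up to a pbuf's worth of data.  The user
 * buffer is wired and mapped with vmapbuf(), the file-relative block is
 * translated to a device block with ufs_bmaparray(), and the request is
 * clamped to the contiguous run it reports (and to the pbuf's KVA size).
 * Holes are zero-filled in place and marked done without issuing I/O;
 * everything else goes straight to the device vnode's strategy routine
 * as a BIO_READ.
 */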
static int
ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t offset,
    size_t len, struct thread *td, struct buf *bp, caddr_t sa)
{
        int error;
        u_int iolen;
        off_t blockno;
        int blockoff;
        int bsize;
        struct vnode *dp;
        int bforwards;
        struct inode *ip;
        ufs2_daddr_t blkno;

        bsize = vp->v_mount->mnt_stat.f_iosize;

        ip = VTOI(vp);
        dp = ip->i_devvp;

        iolen = ((vm_offset_t) udata) & PAGE_MASK;
        bp->b_bcount = len;
        if (bp->b_bcount + iolen > bp->b_kvasize) {
                bp->b_bcount = bp->b_kvasize;
                if (iolen != 0)
                        bp->b_bcount -= PAGE_SIZE;
        }
        bp->b_flags = 0;        /* XXX necessary ? */
        bp->b_iocmd = BIO_READ;
        bp->b_iodone = bdone;
        bp->b_data = udata;
        bp->b_saveaddr = sa;
        blockno = offset / bsize;
        blockoff = (offset % bsize) / DEV_BSIZE;
        if ((daddr_t) blockno != blockno) {
                return EINVAL; /* blockno overflow */
        }

        bp->b_lblkno = bp->b_blkno = blockno;

        error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards,
            NULL);
        if (error != 0)
                return error;
        if (blkno == -1) {

                /* Fill holes with NULs to preserve semantics */

                if (bp->b_bcount + blockoff * DEV_BSIZE > bsize)
                        bp->b_bcount = bsize - blockoff * DEV_BSIZE;
                bp->b_bufsize = bp->b_bcount;

                if (vmapbuf(bp, 1) < 0)
                        return EFAULT;

                maybe_yield();
                bzero(bp->b_data, bp->b_bufsize);

                /* Mark operation completed (similar to bufdone()) */

                bp->b_resid = 0;
                bp->b_flags |= B_DONE;
                return 0;
        }
        bp->b_blkno = blkno + blockoff;
        bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE;

        if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
                bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;
        bp->b_bufsize = bp->b_bcount;

        if (vmapbuf(bp, 1) < 0)
                return EFAULT;

        BO_STRATEGY(&dp->v_bufobj, bp);
        return 0;
}
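
/*
 * Main loop for a raw read.  One pbuf carries the chunk currently being
 * waited on; when vfs.ffs.rawreadahead is enabled a second pbuf prefetches
 * the following chunk, and the two are swapped as each I/O completes.
 * Short transfers are retried for the remaining part, and the uio is
 * updated with whatever was actually transferred before returning.
 */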
static int
ffs_rawread_main(struct vnode *vp, struct uio *uio)
{
        int error, nerror;
        struct buf *bp, *nbp, *tbp;
        caddr_t sa, nsa, tsa;
        u_int iolen;
        int spl;
        caddr_t udata;
        long resid;
        off_t offset;
        struct thread *td;

        td = uio->uio_td ? uio->uio_td : curthread;
        udata = uio->uio_iov->iov_base;
        resid = uio->uio_resid;
        offset = uio->uio_offset;

        /*
         * keep the process from being swapped
         */
        PHOLD(td->td_proc);

        error = 0;
        nerror = 0;

        bp = NULL;
        nbp = NULL;
        sa = NULL;
        nsa = NULL;

        while (resid > 0) {

                if (bp == NULL) { /* Setup first read */
                        /* XXX: Leave some bufs for swap */
                        bp = getpbuf(&ffsrawbufcnt);
                        sa = bp->b_data;
                        pbgetvp(vp, bp);
                        error = ffs_rawread_readahead(vp, udata, offset,
                            resid, td, bp, sa);
                        if (error != 0)
                                break;

                        if (resid > bp->b_bufsize) { /* Setup first readahead */
                                /* XXX: Leave bufs for swap */
                                if (rawreadahead != 0)
                                        nbp = trypbuf(&ffsrawbufcnt);
                                else
                                        nbp = NULL;
                                if (nbp != NULL) {
                                        nsa = nbp->b_data;
                                        pbgetvp(vp, nbp);

                                        nerror = ffs_rawread_readahead(vp,
                                            udata + bp->b_bufsize,
                                            offset + bp->b_bufsize,
                                            resid - bp->b_bufsize,
                                            td, nbp, nsa);
                                        if (nerror) {
                                                pbrelvp(nbp);
                                                relpbuf(nbp, &ffsrawbufcnt);
                                                nbp = NULL;
                                        }
                                }
                        }
                }

                spl = splbio();
                bwait(bp, PRIBIO, "rawrd");
                splx(spl);

                vunmapbuf(bp);

                iolen = bp->b_bcount - bp->b_resid;
                if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
                        nerror = 0;     /* Ignore possible beyond EOF error */
                        break; /* EOF */
                }

                if ((bp->b_ioflags & BIO_ERROR) != 0) {
                        error = bp->b_error;
                        break;
                }
                resid -= iolen;
                udata += iolen;
                offset += iolen;
                if (iolen < bp->b_bufsize) {
                        /* Incomplete read.  Try to read remaining part */
                        error = ffs_rawread_readahead(vp, udata, offset,
                            bp->b_bufsize - iolen, td, bp, sa);
                        if (error != 0)
                                break;
                } else if (nbp != NULL) { /* Complete read with readahead */

                        tbp = bp;
                        bp = nbp;
                        nbp = tbp;

                        tsa = sa;
                        sa = nsa;
                        nsa = tsa;

                        if (resid <= bp->b_bufsize) { /* No more readaheads */
                                pbrelvp(nbp);
                                relpbuf(nbp, &ffsrawbufcnt);
                                nbp = NULL;
                        } else { /* Setup next readahead */
                                nerror = ffs_rawread_readahead(vp,
                                    udata + bp->b_bufsize,
                                    offset + bp->b_bufsize,
                                    resid - bp->b_bufsize,
                                    td, nbp, nsa);
                                if (nerror != 0) {
                                        pbrelvp(nbp);
                                        relpbuf(nbp, &ffsrawbufcnt);
                                        nbp = NULL;
                                }
                        }
                } else if (nerror != 0) { /* Deferred readahead error */
                        break;
                } else if (resid > 0) { /* More to read, no readahead */
                        error = ffs_rawread_readahead(vp, udata, offset,
                            resid, td, bp, sa);
                        if (error != 0)
                                break;
                }
        }

        if (bp != NULL) {
                pbrelvp(bp);
                relpbuf(bp, &ffsrawbufcnt);
        }
        if (nbp != NULL) {                      /* Run down readahead buffer */
                spl = splbio();
                bwait(nbp, PRIBIO, "rawrd");
                splx(spl);
                vunmapbuf(nbp);
                pbrelvp(nbp);
                relpbuf(nbp, &ffsrawbufcnt);
        }

        if (error == 0)
                error = nerror;
        PRELE(td->td_proc);
        uio->uio_iov->iov_base = udata;
        uio->uio_resid = resid;
        uio->uio_offset = offset;
        return error;
}

int
ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone)
{
        if (allowrawread != 0 &&
            uio->uio_iovcnt == 1 &&
            uio->uio_segflg == UIO_USERSPACE &&
            uio->uio_resid == uio->uio_iov->iov_len &&
            (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags &
             TDP_DEADLKTREAT) == 0) {
                int secsize;            /* Media sector size */
                off_t filebytes;        /* Bytes left of file */
                int blockbytes;         /* Bytes left of file in full blocks */
                int partialbytes;       /* Bytes in last partial block */
                int skipbytes;          /* Bytes not to read in ffs_rawread */
                struct inode *ip;
                int error;

                /* Only handle sector aligned reads */
                ip = VTOI(vp);
                secsize = ip->i_devvp->v_bufobj.bo_bsize;
                if ((uio->uio_offset & (secsize - 1)) == 0 &&
                    (uio->uio_resid & (secsize - 1)) == 0) {

                        /* Sync dirty pages and buffers if needed */
                        error = ffs_rawread_sync(vp);
                        if (error != 0)
                                return error;

                        /* Check for end of file */
                        if (ip->i_size > uio->uio_offset) {
                                filebytes = ip->i_size - uio->uio_offset;

                                /* No special eof handling needed ? */
                                if (uio->uio_resid <= filebytes) {
                                        *workdone = 1;
                                        return ffs_rawread_main(vp, uio);
                                }

                                partialbytes = ((unsigned int) ip->i_size) %
                                    ip->i_fs->fs_bsize;
                                blockbytes = (int) filebytes - partialbytes;
                                if (blockbytes > 0) {
                                        skipbytes = uio->uio_resid -
                                            blockbytes;
                                        uio->uio_resid = blockbytes;
                                        error = ffs_rawread_main(vp, uio);
                                        uio->uio_resid += skipbytes;
                                        if (error != 0)
                                                return error;
                                        /* Read remaining part using buffer */
                                }
                        }
                }
        }
        *workdone = 0;
        return 0;
}