1 /*- 2 * Copyright (c) 2000-2003 Tor Egge 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/fcntl.h> 33 #include <sys/file.h> 34 #include <sys/stat.h> 35 #include <sys/proc.h> 36 #include <sys/limits.h> 37 #include <sys/mount.h> 38 #include <sys/namei.h> 39 #include <sys/vnode.h> 40 #include <sys/conf.h> 41 #include <sys/filio.h> 42 #include <sys/ttycom.h> 43 #include <sys/bio.h> 44 #include <sys/buf.h> 45 #include <sys/rwlock.h> 46 #include <ufs/ufs/extattr.h> 47 #include <ufs/ufs/quota.h> 48 #include <ufs/ufs/inode.h> 49 #include <ufs/ufs/ufsmount.h> 50 #include <ufs/ufs/ufs_extern.h> 51 #include <ufs/ffs/fs.h> 52 #include <ufs/ffs/ffs_extern.h> 53 54 #include <vm/vm.h> 55 #include <vm/vm_extern.h> 56 #include <vm/vm_object.h> 57 #include <sys/kernel.h> 58 #include <sys/sysctl.h> 59 60 static int ffs_rawread_readahead(struct vnode *vp, 61 caddr_t udata, 62 off_t offset, 63 size_t len, 64 struct thread *td, 65 struct buf *bp); 66 static int ffs_rawread_main(struct vnode *vp, 67 struct uio *uio); 68 69 static int ffs_rawread_sync(struct vnode *vp); 70 71 int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 72 73 SYSCTL_DECL(_vfs_ffs); 74 75 static int ffsrawbufcnt = 4; 76 SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0, 77 "Buffers available for raw reads"); 78 79 static int allowrawread = 1; 80 SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, 81 "Flag to enable raw reads"); 82 83 static int rawreadahead = 1; 84 SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, 85 "Flag to enable readahead for long raw reads"); 86 87 static void 88 ffs_rawread_setup(void *arg __unused) 89 { 90 91 ffsrawbufcnt = (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8; 92 } 93 SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL); 94 95 static int 96 ffs_rawread_sync(struct vnode *vp) 97 { 98 int error; 99 int upgraded; 100 struct bufobj *bo; 101 struct mount *mp; 102 vm_object_t obj; 103 104 /* Check for dirty mmap, pending writes and dirty buffers */ 105 bo = &vp->v_bufobj; 106 BO_LOCK(bo); 107 VI_LOCK(vp); 108 if (bo->bo_numoutput > 0 || 109 bo->bo_dirty.bv_cnt > 0 || 110 ((obj = vp->v_object) != NULL && 111 (obj->flags & OBJ_MIGHTBEDIRTY) != 0)) { 112 VI_UNLOCK(vp); 113 BO_UNLOCK(bo); 114 115 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 116 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 117 upgraded = 1; 118 else 119 upgraded = 0; 120 VOP_UNLOCK(vp, 0); 121 (void) vn_start_write(vp, &mp, V_WAIT); 122 VOP_LOCK(vp, LK_EXCLUSIVE); 123 } else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 124 upgraded = 1; 125 /* Upgrade to exclusive lock, this might block */ 126 VOP_LOCK(vp, LK_UPGRADE); 127 } else 128 upgraded = 0; 129 130 131 VI_LOCK(vp); 132 /* Check if vnode was reclaimed while unlocked. */ 133 if ((vp->v_iflag & VI_DOOMED) != 0) { 134 VI_UNLOCK(vp); 135 if (upgraded != 0) 136 VOP_LOCK(vp, LK_DOWNGRADE); 137 vn_finished_write(mp); 138 return (EIO); 139 } 140 /* Attempt to msync mmap() regions to clean dirty mmap */ 141 if ((obj = vp->v_object) != NULL && 142 (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { 143 VI_UNLOCK(vp); 144 VM_OBJECT_WLOCK(obj); 145 vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); 146 VM_OBJECT_WUNLOCK(obj); 147 } else 148 VI_UNLOCK(vp); 149 150 /* Wait for pending writes to complete */ 151 BO_LOCK(bo); 152 error = bufobj_wwait(&vp->v_bufobj, 0, 0); 153 if (error != 0) { 154 /* XXX: can't happen with a zero timeout ??? */ 155 BO_UNLOCK(bo); 156 if (upgraded != 0) 157 VOP_LOCK(vp, LK_DOWNGRADE); 158 vn_finished_write(mp); 159 return (error); 160 } 161 /* Flush dirty buffers */ 162 if (bo->bo_dirty.bv_cnt > 0) { 163 BO_UNLOCK(bo); 164 if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) { 165 if (upgraded != 0) 166 VOP_LOCK(vp, LK_DOWNGRADE); 167 vn_finished_write(mp); 168 return (error); 169 } 170 BO_LOCK(bo); 171 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 172 panic("ffs_rawread_sync: dirty bufs"); 173 } 174 BO_UNLOCK(bo); 175 if (upgraded != 0) 176 VOP_LOCK(vp, LK_DOWNGRADE); 177 vn_finished_write(mp); 178 } else { 179 VI_UNLOCK(vp); 180 BO_UNLOCK(bo); 181 } 182 return 0; 183 } 184 185 186 static int 187 ffs_rawread_readahead(struct vnode *vp, 188 caddr_t udata, 189 off_t offset, 190 size_t len, 191 struct thread *td, 192 struct buf *bp) 193 { 194 int error; 195 u_int iolen; 196 off_t blockno; 197 int blockoff; 198 int bsize; 199 struct vnode *dp; 200 int bforwards; 201 struct inode *ip; 202 ufs2_daddr_t blkno; 203 204 bsize = vp->v_mount->mnt_stat.f_iosize; 205 206 ip = VTOI(vp); 207 dp = ip->i_devvp; 208 209 iolen = ((vm_offset_t) udata) & PAGE_MASK; 210 bp->b_bcount = len; 211 if (bp->b_bcount + iolen > bp->b_kvasize) { 212 bp->b_bcount = bp->b_kvasize; 213 if (iolen != 0) 214 bp->b_bcount -= PAGE_SIZE; 215 } 216 bp->b_flags = 0; /* XXX necessary ? */ 217 bp->b_iocmd = BIO_READ; 218 bp->b_iodone = bdone; 219 bp->b_data = udata; 220 blockno = offset / bsize; 221 blockoff = (offset % bsize) / DEV_BSIZE; 222 if ((daddr_t) blockno != blockno) { 223 return EINVAL; /* blockno overflow */ 224 } 225 226 bp->b_lblkno = bp->b_blkno = blockno; 227 228 error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL); 229 if (error != 0) 230 return error; 231 if (blkno == -1) { 232 233 /* Fill holes with NULs to preserve semantics */ 234 235 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) 236 bp->b_bcount = bsize - blockoff * DEV_BSIZE; 237 bp->b_bufsize = bp->b_bcount; 238 239 if (vmapbuf(bp, 1) < 0) 240 return EFAULT; 241 242 maybe_yield(); 243 bzero(bp->b_data, bp->b_bufsize); 244 245 /* Mark operation completed (similar to bufdone()) */ 246 247 bp->b_resid = 0; 248 bp->b_flags |= B_DONE; 249 return 0; 250 } 251 bp->b_blkno = blkno + blockoff; 252 bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE; 253 254 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) 255 bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; 256 bp->b_bufsize = bp->b_bcount; 257 258 if (vmapbuf(bp, 1) < 0) 259 return EFAULT; 260 261 BO_STRATEGY(&dp->v_bufobj, bp); 262 return 0; 263 } 264 265 266 static int 267 ffs_rawread_main(struct vnode *vp, 268 struct uio *uio) 269 { 270 int error, nerror; 271 struct buf *bp, *nbp, *tbp; 272 u_int iolen; 273 int spl; 274 caddr_t udata; 275 long resid; 276 off_t offset; 277 struct thread *td; 278 279 td = uio->uio_td ? uio->uio_td : curthread; 280 udata = uio->uio_iov->iov_base; 281 resid = uio->uio_resid; 282 offset = uio->uio_offset; 283 284 /* 285 * keep the process from being swapped 286 */ 287 PHOLD(td->td_proc); 288 289 error = 0; 290 nerror = 0; 291 292 bp = NULL; 293 nbp = NULL; 294 295 while (resid > 0) { 296 297 if (bp == NULL) { /* Setup first read */ 298 /* XXX: Leave some bufs for swap */ 299 bp = getpbuf(&ffsrawbufcnt); 300 pbgetvp(vp, bp); 301 error = ffs_rawread_readahead(vp, udata, offset, 302 resid, td, bp); 303 if (error != 0) 304 break; 305 306 if (resid > bp->b_bufsize) { /* Setup fist readahead */ 307 /* XXX: Leave bufs for swap */ 308 if (rawreadahead != 0) 309 nbp = trypbuf(&ffsrawbufcnt); 310 else 311 nbp = NULL; 312 if (nbp != NULL) { 313 pbgetvp(vp, nbp); 314 315 nerror = ffs_rawread_readahead(vp, 316 udata + 317 bp->b_bufsize, 318 offset + 319 bp->b_bufsize, 320 resid - 321 bp->b_bufsize, 322 td, 323 nbp); 324 if (nerror) { 325 pbrelvp(nbp); 326 relpbuf(nbp, &ffsrawbufcnt); 327 nbp = NULL; 328 } 329 } 330 } 331 } 332 333 spl = splbio(); 334 bwait(bp, PRIBIO, "rawrd"); 335 splx(spl); 336 337 vunmapbuf(bp); 338 339 iolen = bp->b_bcount - bp->b_resid; 340 if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { 341 nerror = 0; /* Ignore possible beyond EOF error */ 342 break; /* EOF */ 343 } 344 345 if ((bp->b_ioflags & BIO_ERROR) != 0) { 346 error = bp->b_error; 347 break; 348 } 349 resid -= iolen; 350 udata += iolen; 351 offset += iolen; 352 if (iolen < bp->b_bufsize) { 353 /* Incomplete read. Try to read remaining part */ 354 error = ffs_rawread_readahead(vp, 355 udata, 356 offset, 357 bp->b_bufsize - iolen, 358 td, 359 bp); 360 if (error != 0) 361 break; 362 } else if (nbp != NULL) { /* Complete read with readahead */ 363 364 tbp = bp; 365 bp = nbp; 366 nbp = tbp; 367 368 if (resid <= bp->b_bufsize) { /* No more readaheads */ 369 pbrelvp(nbp); 370 relpbuf(nbp, &ffsrawbufcnt); 371 nbp = NULL; 372 } else { /* Setup next readahead */ 373 nerror = ffs_rawread_readahead(vp, 374 udata + 375 bp->b_bufsize, 376 offset + 377 bp->b_bufsize, 378 resid - 379 bp->b_bufsize, 380 td, 381 nbp); 382 if (nerror != 0) { 383 pbrelvp(nbp); 384 relpbuf(nbp, &ffsrawbufcnt); 385 nbp = NULL; 386 } 387 } 388 } else if (nerror != 0) {/* Deferred Readahead error */ 389 break; 390 } else if (resid > 0) { /* More to read, no readahead */ 391 error = ffs_rawread_readahead(vp, udata, offset, 392 resid, td, bp); 393 if (error != 0) 394 break; 395 } 396 } 397 398 if (bp != NULL) { 399 pbrelvp(bp); 400 relpbuf(bp, &ffsrawbufcnt); 401 } 402 if (nbp != NULL) { /* Run down readahead buffer */ 403 spl = splbio(); 404 bwait(nbp, PRIBIO, "rawrd"); 405 splx(spl); 406 vunmapbuf(nbp); 407 pbrelvp(nbp); 408 relpbuf(nbp, &ffsrawbufcnt); 409 } 410 411 if (error == 0) 412 error = nerror; 413 PRELE(td->td_proc); 414 uio->uio_iov->iov_base = udata; 415 uio->uio_resid = resid; 416 uio->uio_offset = offset; 417 return error; 418 } 419 420 421 int 422 ffs_rawread(struct vnode *vp, 423 struct uio *uio, 424 int *workdone) 425 { 426 if (allowrawread != 0 && 427 uio->uio_iovcnt == 1 && 428 uio->uio_segflg == UIO_USERSPACE && 429 uio->uio_resid == uio->uio_iov->iov_len && 430 (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags & 431 TDP_DEADLKTREAT) == 0) { 432 int secsize; /* Media sector size */ 433 off_t filebytes; /* Bytes left of file */ 434 int blockbytes; /* Bytes left of file in full blocks */ 435 int partialbytes; /* Bytes in last partial block */ 436 int skipbytes; /* Bytes not to read in ffs_rawread */ 437 struct inode *ip; 438 int error; 439 440 441 /* Only handle sector aligned reads */ 442 ip = VTOI(vp); 443 secsize = ip->i_devvp->v_bufobj.bo_bsize; 444 if ((uio->uio_offset & (secsize - 1)) == 0 && 445 (uio->uio_resid & (secsize - 1)) == 0) { 446 447 /* Sync dirty pages and buffers if needed */ 448 error = ffs_rawread_sync(vp); 449 if (error != 0) 450 return error; 451 452 /* Check for end of file */ 453 if (ip->i_size > uio->uio_offset) { 454 filebytes = ip->i_size - uio->uio_offset; 455 456 /* No special eof handling needed ? */ 457 if (uio->uio_resid <= filebytes) { 458 *workdone = 1; 459 return ffs_rawread_main(vp, uio); 460 } 461 462 partialbytes = ((unsigned int) ip->i_size) % 463 ip->i_fs->fs_bsize; 464 blockbytes = (int) filebytes - partialbytes; 465 if (blockbytes > 0) { 466 skipbytes = uio->uio_resid - 467 blockbytes; 468 uio->uio_resid = blockbytes; 469 error = ffs_rawread_main(vp, uio); 470 uio->uio_resid += skipbytes; 471 if (error != 0) 472 return error; 473 /* Read remaining part using buffer */ 474 } 475 } 476 } 477 } 478 *workdone = 0; 479 return 0; 480 } 481