1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2000-2003 Tor Egge 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/systm.h> 34 #include <sys/fcntl.h> 35 #include <sys/file.h> 36 #include <sys/stat.h> 37 #include <sys/proc.h> 38 #include <sys/limits.h> 39 #include <sys/mount.h> 40 #include <sys/namei.h> 41 #include <sys/vnode.h> 42 #include <sys/conf.h> 43 #include <sys/filio.h> 44 #include <sys/ttycom.h> 45 #include <sys/bio.h> 46 #include <sys/buf.h> 47 #include <sys/rwlock.h> 48 #include <ufs/ufs/extattr.h> 49 #include <ufs/ufs/quota.h> 50 #include <ufs/ufs/inode.h> 51 #include <ufs/ufs/ufsmount.h> 52 #include <ufs/ufs/ufs_extern.h> 53 #include <ufs/ffs/fs.h> 54 #include <ufs/ffs/ffs_extern.h> 55 56 #include <vm/vm.h> 57 #include <vm/vm_extern.h> 58 #include <vm/vm_object.h> 59 #include <sys/kernel.h> 60 #include <sys/sysctl.h> 61 62 static int ffs_rawread_readahead(struct vnode *vp, 63 caddr_t udata, 64 off_t offset, 65 size_t len, 66 struct thread *td, 67 struct buf *bp); 68 static int ffs_rawread_main(struct vnode *vp, 69 struct uio *uio); 70 71 static int ffs_rawread_sync(struct vnode *vp); 72 73 int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 74 75 SYSCTL_DECL(_vfs_ffs); 76 77 static int ffsrawbufcnt = 4; 78 SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0, 79 "Buffers available for raw reads"); 80 81 static int allowrawread = 1; 82 SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, 83 "Flag to enable raw reads"); 84 85 static int rawreadahead = 1; 86 SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, 87 "Flag to enable readahead for long raw reads"); 88 89 static void 90 ffs_rawread_setup(void *arg __unused) 91 { 92 93 ffsrawbufcnt = (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8; 94 } 95 SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL); 96 97 static int 98 ffs_rawread_sync(struct vnode *vp) 99 { 100 int error; 101 int upgraded; 102 struct bufobj *bo; 103 struct mount *mp; 104 vm_object_t obj; 105 106 /* Check for dirty mmap, pending writes and dirty buffers */ 107 bo = &vp->v_bufobj; 108 BO_LOCK(bo); 109 VI_LOCK(vp); 110 if (bo->bo_numoutput > 0 || 111 bo->bo_dirty.bv_cnt > 0 || 112 ((obj = vp->v_object) != NULL && 113 (obj->flags & OBJ_MIGHTBEDIRTY) != 0)) { 114 VI_UNLOCK(vp); 115 BO_UNLOCK(bo); 116 117 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 118 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 119 upgraded = 1; 120 else 121 upgraded = 0; 122 VOP_UNLOCK(vp, 0); 123 (void) vn_start_write(vp, &mp, V_WAIT); 124 VOP_LOCK(vp, LK_EXCLUSIVE); 125 } else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 126 upgraded = 1; 127 /* Upgrade to exclusive lock, this might block */ 128 VOP_LOCK(vp, LK_UPGRADE); 129 } else 130 upgraded = 0; 131 132 133 VI_LOCK(vp); 134 /* Check if vnode was reclaimed while unlocked. */ 135 if ((vp->v_iflag & VI_DOOMED) != 0) { 136 VI_UNLOCK(vp); 137 if (upgraded != 0) 138 VOP_LOCK(vp, LK_DOWNGRADE); 139 vn_finished_write(mp); 140 return (EIO); 141 } 142 /* Attempt to msync mmap() regions to clean dirty mmap */ 143 if ((obj = vp->v_object) != NULL && 144 (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { 145 VI_UNLOCK(vp); 146 VM_OBJECT_WLOCK(obj); 147 vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); 148 VM_OBJECT_WUNLOCK(obj); 149 } else 150 VI_UNLOCK(vp); 151 152 /* Wait for pending writes to complete */ 153 BO_LOCK(bo); 154 error = bufobj_wwait(&vp->v_bufobj, 0, 0); 155 if (error != 0) { 156 /* XXX: can't happen with a zero timeout ??? */ 157 BO_UNLOCK(bo); 158 if (upgraded != 0) 159 VOP_LOCK(vp, LK_DOWNGRADE); 160 vn_finished_write(mp); 161 return (error); 162 } 163 /* Flush dirty buffers */ 164 if (bo->bo_dirty.bv_cnt > 0) { 165 BO_UNLOCK(bo); 166 if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) { 167 if (upgraded != 0) 168 VOP_LOCK(vp, LK_DOWNGRADE); 169 vn_finished_write(mp); 170 return (error); 171 } 172 BO_LOCK(bo); 173 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 174 panic("ffs_rawread_sync: dirty bufs"); 175 } 176 BO_UNLOCK(bo); 177 if (upgraded != 0) 178 VOP_LOCK(vp, LK_DOWNGRADE); 179 vn_finished_write(mp); 180 } else { 181 VI_UNLOCK(vp); 182 BO_UNLOCK(bo); 183 } 184 return 0; 185 } 186 187 188 static int 189 ffs_rawread_readahead(struct vnode *vp, 190 caddr_t udata, 191 off_t offset, 192 size_t len, 193 struct thread *td, 194 struct buf *bp) 195 { 196 int error; 197 u_int iolen; 198 off_t blockno; 199 int blockoff; 200 int bsize; 201 struct vnode *dp; 202 int bforwards; 203 struct inode *ip; 204 ufs2_daddr_t blkno; 205 206 bsize = vp->v_mount->mnt_stat.f_iosize; 207 208 ip = VTOI(vp); 209 dp = ITODEVVP(ip); 210 211 iolen = ((vm_offset_t) udata) & PAGE_MASK; 212 bp->b_bcount = len; 213 if (bp->b_bcount + iolen > bp->b_kvasize) { 214 bp->b_bcount = bp->b_kvasize; 215 if (iolen != 0) 216 bp->b_bcount -= PAGE_SIZE; 217 } 218 bp->b_flags = 0; /* XXX necessary ? */ 219 bp->b_iocmd = BIO_READ; 220 bp->b_iodone = bdone; 221 bp->b_data = udata; 222 blockno = offset / bsize; 223 blockoff = (offset % bsize) / DEV_BSIZE; 224 if ((daddr_t) blockno != blockno) { 225 return EINVAL; /* blockno overflow */ 226 } 227 228 bp->b_lblkno = bp->b_blkno = blockno; 229 230 error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL); 231 if (error != 0) 232 return error; 233 if (blkno == -1) { 234 235 /* Fill holes with NULs to preserve semantics */ 236 237 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) 238 bp->b_bcount = bsize - blockoff * DEV_BSIZE; 239 bp->b_bufsize = bp->b_bcount; 240 241 if (vmapbuf(bp, 1) < 0) 242 return EFAULT; 243 244 maybe_yield(); 245 bzero(bp->b_data, bp->b_bufsize); 246 247 /* Mark operation completed (similar to bufdone()) */ 248 249 bp->b_resid = 0; 250 bp->b_flags |= B_DONE; 251 return 0; 252 } 253 bp->b_blkno = blkno + blockoff; 254 bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE; 255 256 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) 257 bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; 258 bp->b_bufsize = bp->b_bcount; 259 260 if (vmapbuf(bp, 1) < 0) 261 return EFAULT; 262 263 BO_STRATEGY(&dp->v_bufobj, bp); 264 return 0; 265 } 266 267 268 static int 269 ffs_rawread_main(struct vnode *vp, 270 struct uio *uio) 271 { 272 int error, nerror; 273 struct buf *bp, *nbp, *tbp; 274 u_int iolen; 275 caddr_t udata; 276 long resid; 277 off_t offset; 278 struct thread *td; 279 280 td = uio->uio_td ? uio->uio_td : curthread; 281 udata = uio->uio_iov->iov_base; 282 resid = uio->uio_resid; 283 offset = uio->uio_offset; 284 285 /* 286 * keep the process from being swapped 287 */ 288 PHOLD(td->td_proc); 289 290 error = 0; 291 nerror = 0; 292 293 bp = NULL; 294 nbp = NULL; 295 296 while (resid > 0) { 297 298 if (bp == NULL) { /* Setup first read */ 299 /* XXX: Leave some bufs for swap */ 300 bp = getpbuf(&ffsrawbufcnt); 301 pbgetvp(vp, bp); 302 error = ffs_rawread_readahead(vp, udata, offset, 303 resid, td, bp); 304 if (error != 0) 305 break; 306 307 if (resid > bp->b_bufsize) { /* Setup fist readahead */ 308 /* XXX: Leave bufs for swap */ 309 if (rawreadahead != 0) 310 nbp = trypbuf(&ffsrawbufcnt); 311 else 312 nbp = NULL; 313 if (nbp != NULL) { 314 pbgetvp(vp, nbp); 315 316 nerror = ffs_rawread_readahead(vp, 317 udata + 318 bp->b_bufsize, 319 offset + 320 bp->b_bufsize, 321 resid - 322 bp->b_bufsize, 323 td, 324 nbp); 325 if (nerror) { 326 pbrelvp(nbp); 327 relpbuf(nbp, &ffsrawbufcnt); 328 nbp = NULL; 329 } 330 } 331 } 332 } 333 334 bwait(bp, PRIBIO, "rawrd"); 335 vunmapbuf(bp); 336 337 iolen = bp->b_bcount - bp->b_resid; 338 if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { 339 nerror = 0; /* Ignore possible beyond EOF error */ 340 break; /* EOF */ 341 } 342 343 if ((bp->b_ioflags & BIO_ERROR) != 0) { 344 error = bp->b_error; 345 break; 346 } 347 resid -= iolen; 348 udata += iolen; 349 offset += iolen; 350 if (iolen < bp->b_bufsize) { 351 /* Incomplete read. Try to read remaining part */ 352 error = ffs_rawread_readahead(vp, 353 udata, 354 offset, 355 bp->b_bufsize - iolen, 356 td, 357 bp); 358 if (error != 0) 359 break; 360 } else if (nbp != NULL) { /* Complete read with readahead */ 361 362 tbp = bp; 363 bp = nbp; 364 nbp = tbp; 365 366 if (resid <= bp->b_bufsize) { /* No more readaheads */ 367 pbrelvp(nbp); 368 relpbuf(nbp, &ffsrawbufcnt); 369 nbp = NULL; 370 } else { /* Setup next readahead */ 371 nerror = ffs_rawread_readahead(vp, 372 udata + 373 bp->b_bufsize, 374 offset + 375 bp->b_bufsize, 376 resid - 377 bp->b_bufsize, 378 td, 379 nbp); 380 if (nerror != 0) { 381 pbrelvp(nbp); 382 relpbuf(nbp, &ffsrawbufcnt); 383 nbp = NULL; 384 } 385 } 386 } else if (nerror != 0) {/* Deferred Readahead error */ 387 break; 388 } else if (resid > 0) { /* More to read, no readahead */ 389 error = ffs_rawread_readahead(vp, udata, offset, 390 resid, td, bp); 391 if (error != 0) 392 break; 393 } 394 } 395 396 if (bp != NULL) { 397 pbrelvp(bp); 398 relpbuf(bp, &ffsrawbufcnt); 399 } 400 if (nbp != NULL) { /* Run down readahead buffer */ 401 bwait(nbp, PRIBIO, "rawrd"); 402 vunmapbuf(nbp); 403 pbrelvp(nbp); 404 relpbuf(nbp, &ffsrawbufcnt); 405 } 406 407 if (error == 0) 408 error = nerror; 409 PRELE(td->td_proc); 410 uio->uio_iov->iov_base = udata; 411 uio->uio_resid = resid; 412 uio->uio_offset = offset; 413 return error; 414 } 415 416 417 int 418 ffs_rawread(struct vnode *vp, 419 struct uio *uio, 420 int *workdone) 421 { 422 if (allowrawread != 0 && 423 uio->uio_iovcnt == 1 && 424 uio->uio_segflg == UIO_USERSPACE && 425 uio->uio_resid == uio->uio_iov->iov_len && 426 (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags & 427 TDP_DEADLKTREAT) == 0) { 428 int secsize; /* Media sector size */ 429 off_t filebytes; /* Bytes left of file */ 430 int blockbytes; /* Bytes left of file in full blocks */ 431 int partialbytes; /* Bytes in last partial block */ 432 int skipbytes; /* Bytes not to read in ffs_rawread */ 433 struct inode *ip; 434 int error; 435 436 437 /* Only handle sector aligned reads */ 438 ip = VTOI(vp); 439 secsize = ITODEVVP(ip)->v_bufobj.bo_bsize; 440 if ((uio->uio_offset & (secsize - 1)) == 0 && 441 (uio->uio_resid & (secsize - 1)) == 0) { 442 443 /* Sync dirty pages and buffers if needed */ 444 error = ffs_rawread_sync(vp); 445 if (error != 0) 446 return error; 447 448 /* Check for end of file */ 449 if (ip->i_size > uio->uio_offset) { 450 filebytes = ip->i_size - uio->uio_offset; 451 452 /* No special eof handling needed ? */ 453 if (uio->uio_resid <= filebytes) { 454 *workdone = 1; 455 return ffs_rawread_main(vp, uio); 456 } 457 458 partialbytes = ((unsigned int) ip->i_size) % 459 ITOFS(ip)->fs_bsize; 460 blockbytes = (int) filebytes - partialbytes; 461 if (blockbytes > 0) { 462 skipbytes = uio->uio_resid - 463 blockbytes; 464 uio->uio_resid = blockbytes; 465 error = ffs_rawread_main(vp, uio); 466 uio->uio_resid += skipbytes; 467 if (error != 0) 468 return error; 469 /* Read remaining part using buffer */ 470 } 471 } 472 } 473 } 474 *workdone = 0; 475 return 0; 476 } 477