1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2000-2003 Tor Egge 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/fcntl.h> 33 #include <sys/file.h> 34 #include <sys/stat.h> 35 #include <sys/proc.h> 36 #include <sys/limits.h> 37 #include <sys/mount.h> 38 #include <sys/namei.h> 39 #include <sys/vnode.h> 40 #include <sys/conf.h> 41 #include <sys/filio.h> 42 #include <sys/ttycom.h> 43 #include <sys/bio.h> 44 #include <sys/buf.h> 45 #include <sys/rwlock.h> 46 #include <ufs/ufs/extattr.h> 47 #include <ufs/ufs/quota.h> 48 #include <ufs/ufs/inode.h> 49 #include <ufs/ufs/ufsmount.h> 50 #include <ufs/ufs/ufs_extern.h> 51 #include <ufs/ffs/fs.h> 52 #include <ufs/ffs/ffs_extern.h> 53 54 #include <vm/vm.h> 55 #include <vm/vm_extern.h> 56 #include <vm/vm_object.h> 57 #include <sys/kernel.h> 58 #include <sys/sysctl.h> 59 60 static int ffs_rawread_readahead(struct vnode *vp, 61 caddr_t udata, 62 off_t offset, 63 size_t len, 64 struct thread *td, 65 struct buf *bp); 66 static int ffs_rawread_main(struct vnode *vp, 67 struct uio *uio); 68 69 static int ffs_rawread_sync(struct vnode *vp); 70 71 int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 72 73 SYSCTL_DECL(_vfs_ffs); 74 75 static uma_zone_t ffsraw_pbuf_zone; 76 77 static int allowrawread = 1; 78 SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, 79 "Flag to enable raw reads"); 80 81 static int rawreadahead = 1; 82 SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, 83 "Flag to enable readahead for long raw reads"); 84 85 static void 86 ffs_rawread_setup(void *arg __unused) 87 { 88 89 ffsraw_pbuf_zone = pbuf_zsecond_create("ffsrawpbuf", 90 (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8); 91 } 92 SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL); 93 94 static int 95 ffs_rawread_sync(struct vnode *vp) 96 { 97 int error; 98 int upgraded; 99 struct bufobj *bo; 100 struct mount *mp; 101 vm_object_t obj; 102 103 /* Check for dirty mmap, pending writes and dirty buffers */ 104 bo = &vp->v_bufobj; 105 BO_LOCK(bo); 106 VI_LOCK(vp); 107 if (bo->bo_numoutput > 0 || 108 bo->bo_dirty.bv_cnt > 0 || 109 ((obj = vp->v_object) != NULL && 110 vm_object_mightbedirty(obj))) { 111 VI_UNLOCK(vp); 112 BO_UNLOCK(bo); 113 114 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 115 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 116 upgraded = 1; 117 else 118 upgraded = 0; 119 VOP_UNLOCK(vp); 120 (void) vn_start_write(vp, &mp, V_WAIT); 121 VOP_LOCK(vp, LK_EXCLUSIVE); 122 } else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 123 upgraded = 1; 124 /* Upgrade to exclusive lock, this might block */ 125 VOP_LOCK(vp, LK_UPGRADE); 126 } else 127 upgraded = 0; 128 129 130 VI_LOCK(vp); 131 /* Check if vnode was reclaimed while unlocked. */ 132 if (VN_IS_DOOMED(vp)) { 133 VI_UNLOCK(vp); 134 if (upgraded != 0) 135 VOP_LOCK(vp, LK_DOWNGRADE); 136 vn_finished_write(mp); 137 return (EIO); 138 } 139 /* Attempt to msync mmap() regions to clean dirty mmap */ 140 if ((obj = vp->v_object) != NULL && 141 vm_object_mightbedirty(obj)) { 142 VI_UNLOCK(vp); 143 VM_OBJECT_WLOCK(obj); 144 vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); 145 VM_OBJECT_WUNLOCK(obj); 146 } else 147 VI_UNLOCK(vp); 148 149 /* Wait for pending writes to complete */ 150 BO_LOCK(bo); 151 error = bufobj_wwait(&vp->v_bufobj, 0, 0); 152 if (error != 0) { 153 /* XXX: can't happen with a zero timeout ??? */ 154 BO_UNLOCK(bo); 155 if (upgraded != 0) 156 VOP_LOCK(vp, LK_DOWNGRADE); 157 vn_finished_write(mp); 158 return (error); 159 } 160 /* Flush dirty buffers */ 161 if (bo->bo_dirty.bv_cnt > 0) { 162 BO_UNLOCK(bo); 163 if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) { 164 if (upgraded != 0) 165 VOP_LOCK(vp, LK_DOWNGRADE); 166 vn_finished_write(mp); 167 return (error); 168 } 169 BO_LOCK(bo); 170 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 171 panic("ffs_rawread_sync: dirty bufs"); 172 } 173 BO_UNLOCK(bo); 174 if (upgraded != 0) 175 VOP_LOCK(vp, LK_DOWNGRADE); 176 vn_finished_write(mp); 177 } else { 178 VI_UNLOCK(vp); 179 BO_UNLOCK(bo); 180 } 181 return 0; 182 } 183 184 static int 185 ffs_rawread_readahead(struct vnode *vp, 186 caddr_t udata, 187 off_t offset, 188 size_t len, 189 struct thread *td, 190 struct buf *bp) 191 { 192 int error; 193 uint64_t iolen; 194 off_t blockno; 195 int blockoff; 196 int bsize; 197 struct vnode *dp; 198 int bforwards; 199 struct inode *ip; 200 ufs2_daddr_t blkno; 201 202 bsize = vp->v_mount->mnt_stat.f_iosize; 203 204 ip = VTOI(vp); 205 dp = ITODEVVP(ip); 206 207 iolen = ((vm_offset_t) udata) & PAGE_MASK; 208 bp->b_bcount = len; 209 if (bp->b_bcount + iolen > bp->b_kvasize) { 210 bp->b_bcount = bp->b_kvasize; 211 if (iolen != 0) 212 bp->b_bcount -= PAGE_SIZE; 213 } 214 bp->b_flags = 0; /* XXX necessary ? */ 215 bp->b_iocmd = BIO_READ; 216 bp->b_iodone = bdone; 217 blockno = offset / bsize; 218 blockoff = (offset % bsize) / DEV_BSIZE; 219 if ((daddr_t) blockno != blockno) { 220 return EINVAL; /* blockno overflow */ 221 } 222 223 bp->b_lblkno = bp->b_blkno = blockno; 224 225 error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL); 226 if (error != 0) 227 return error; 228 if (blkno == -1) { 229 /* Fill holes with NULs to preserve semantics */ 230 231 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) 232 bp->b_bcount = bsize - blockoff * DEV_BSIZE; 233 234 if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0) 235 return EFAULT; 236 237 maybe_yield(); 238 bzero(bp->b_data, bp->b_bufsize); 239 240 /* Mark operation completed (similar to bufdone()) */ 241 242 bp->b_resid = 0; 243 bp->b_flags |= B_DONE; 244 return 0; 245 } 246 bp->b_blkno = blkno + blockoff; 247 bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE; 248 249 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) 250 bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; 251 252 if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0) 253 return EFAULT; 254 255 BO_STRATEGY(&dp->v_bufobj, bp); 256 return 0; 257 } 258 259 static int 260 ffs_rawread_main(struct vnode *vp, 261 struct uio *uio) 262 { 263 int error, nerror; 264 struct buf *bp, *nbp, *tbp; 265 uint64_t iolen; 266 caddr_t udata; 267 long resid; 268 off_t offset; 269 struct thread *td; 270 271 td = uio->uio_td ? uio->uio_td : curthread; 272 udata = uio->uio_iov->iov_base; 273 resid = uio->uio_resid; 274 offset = uio->uio_offset; 275 276 /* 277 * keep the process from being swapped 278 */ 279 PHOLD(td->td_proc); 280 281 error = 0; 282 nerror = 0; 283 284 bp = NULL; 285 nbp = NULL; 286 287 while (resid > 0) { 288 289 if (bp == NULL) { /* Setup first read */ 290 bp = uma_zalloc(ffsraw_pbuf_zone, M_WAITOK); 291 pbgetvp(vp, bp); 292 error = ffs_rawread_readahead(vp, udata, offset, 293 resid, td, bp); 294 if (error != 0) 295 break; 296 297 if (resid > bp->b_bufsize) { /* Setup fist readahead */ 298 if (rawreadahead != 0) 299 nbp = uma_zalloc(ffsraw_pbuf_zone, 300 M_NOWAIT); 301 else 302 nbp = NULL; 303 if (nbp != NULL) { 304 pbgetvp(vp, nbp); 305 306 nerror = ffs_rawread_readahead(vp, 307 udata + 308 bp->b_bufsize, 309 offset + 310 bp->b_bufsize, 311 resid - 312 bp->b_bufsize, 313 td, 314 nbp); 315 if (nerror) { 316 pbrelvp(nbp); 317 uma_zfree(ffsraw_pbuf_zone, 318 nbp); 319 nbp = NULL; 320 } 321 } 322 } 323 } 324 325 bwait(bp, PRIBIO, "rawrd"); 326 vunmapbuf(bp); 327 328 iolen = bp->b_bcount - bp->b_resid; 329 if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { 330 nerror = 0; /* Ignore possible beyond EOF error */ 331 break; /* EOF */ 332 } 333 334 if ((bp->b_ioflags & BIO_ERROR) != 0) { 335 error = bp->b_error; 336 break; 337 } 338 resid -= iolen; 339 udata += iolen; 340 offset += iolen; 341 if (iolen < bp->b_bufsize) { 342 /* Incomplete read. Try to read remaining part */ 343 error = ffs_rawread_readahead(vp, 344 udata, 345 offset, 346 bp->b_bufsize - iolen, 347 td, 348 bp); 349 if (error != 0) 350 break; 351 } else if (nbp != NULL) { /* Complete read with readahead */ 352 353 tbp = bp; 354 bp = nbp; 355 nbp = tbp; 356 357 if (resid <= bp->b_bufsize) { /* No more readaheads */ 358 pbrelvp(nbp); 359 uma_zfree(ffsraw_pbuf_zone, nbp); 360 nbp = NULL; 361 } else { /* Setup next readahead */ 362 nerror = ffs_rawread_readahead(vp, 363 udata + 364 bp->b_bufsize, 365 offset + 366 bp->b_bufsize, 367 resid - 368 bp->b_bufsize, 369 td, 370 nbp); 371 if (nerror != 0) { 372 pbrelvp(nbp); 373 uma_zfree(ffsraw_pbuf_zone, nbp); 374 nbp = NULL; 375 } 376 } 377 } else if (nerror != 0) {/* Deferred Readahead error */ 378 break; 379 } else if (resid > 0) { /* More to read, no readahead */ 380 error = ffs_rawread_readahead(vp, udata, offset, 381 resid, td, bp); 382 if (error != 0) 383 break; 384 } 385 } 386 387 if (bp != NULL) { 388 pbrelvp(bp); 389 uma_zfree(ffsraw_pbuf_zone, bp); 390 } 391 if (nbp != NULL) { /* Run down readahead buffer */ 392 bwait(nbp, PRIBIO, "rawrd"); 393 vunmapbuf(nbp); 394 pbrelvp(nbp); 395 uma_zfree(ffsraw_pbuf_zone, nbp); 396 } 397 398 if (error == 0) 399 error = nerror; 400 PRELE(td->td_proc); 401 uio->uio_iov->iov_base = udata; 402 uio->uio_resid = resid; 403 uio->uio_offset = offset; 404 return error; 405 } 406 407 int 408 ffs_rawread(struct vnode *vp, 409 struct uio *uio, 410 int *workdone) 411 { 412 if (allowrawread != 0 && 413 uio->uio_iovcnt == 1 && 414 uio->uio_segflg == UIO_USERSPACE && 415 uio->uio_resid == uio->uio_iov->iov_len && 416 (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags & 417 TDP_DEADLKTREAT) == 0) { 418 int secsize; /* Media sector size */ 419 off_t filebytes; /* Bytes left of file */ 420 int blockbytes; /* Bytes left of file in full blocks */ 421 int partialbytes; /* Bytes in last partial block */ 422 int skipbytes; /* Bytes not to read in ffs_rawread */ 423 struct inode *ip; 424 int error; 425 426 427 /* Only handle sector aligned reads */ 428 ip = VTOI(vp); 429 secsize = ITODEVVP(ip)->v_bufobj.bo_bsize; 430 if ((uio->uio_offset & (secsize - 1)) == 0 && 431 (uio->uio_resid & (secsize - 1)) == 0) { 432 433 /* Sync dirty pages and buffers if needed */ 434 error = ffs_rawread_sync(vp); 435 if (error != 0) 436 return error; 437 438 /* Check for end of file */ 439 if (ip->i_size > uio->uio_offset) { 440 filebytes = ip->i_size - uio->uio_offset; 441 442 /* No special eof handling needed ? */ 443 if (uio->uio_resid <= filebytes) { 444 *workdone = 1; 445 return ffs_rawread_main(vp, uio); 446 } 447 448 partialbytes = ((unsigned int) ip->i_size) % 449 ITOFS(ip)->fs_bsize; 450 blockbytes = (int) filebytes - partialbytes; 451 if (blockbytes > 0) { 452 skipbytes = uio->uio_resid - 453 blockbytes; 454 uio->uio_resid = blockbytes; 455 error = ffs_rawread_main(vp, uio); 456 uio->uio_resid += skipbytes; 457 if (error != 0) 458 return error; 459 /* Read remaining part using buffer */ 460 } 461 } 462 } 463 } 464 *workdone = 0; 465 return 0; 466 } 467