1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2000-2003 Tor Egge 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/fcntl.h> 32 #include <sys/file.h> 33 #include <sys/stat.h> 34 #include <sys/proc.h> 35 #include <sys/limits.h> 36 #include <sys/mount.h> 37 #include <sys/namei.h> 38 #include <sys/vnode.h> 39 #include <sys/conf.h> 40 #include <sys/filio.h> 41 #include <sys/ttycom.h> 42 #include <sys/bio.h> 43 #include <sys/buf.h> 44 #include <sys/rwlock.h> 45 #include <ufs/ufs/extattr.h> 46 #include <ufs/ufs/quota.h> 47 #include <ufs/ufs/inode.h> 48 #include <ufs/ufs/ufsmount.h> 49 #include <ufs/ufs/ufs_extern.h> 50 #include <ufs/ffs/fs.h> 51 #include <ufs/ffs/ffs_extern.h> 52 53 #include <vm/vm.h> 54 #include <vm/vm_extern.h> 55 #include <vm/vm_object.h> 56 #include <sys/kernel.h> 57 #include <sys/sysctl.h> 58 59 static int ffs_rawread_readahead(struct vnode *vp, 60 caddr_t udata, 61 off_t offset, 62 size_t len, 63 struct thread *td, 64 struct buf *bp); 65 static int ffs_rawread_main(struct vnode *vp, 66 struct uio *uio); 67 68 static int ffs_rawread_sync(struct vnode *vp); 69 70 int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 71 72 SYSCTL_DECL(_vfs_ffs); 73 74 static uma_zone_t ffsraw_pbuf_zone; 75 76 static int allowrawread = 1; 77 SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, 78 "Flag to enable raw reads"); 79 80 static int rawreadahead = 1; 81 SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, 82 "Flag to enable readahead for long raw reads"); 83 84 static void 85 ffs_rawread_setup(void *arg __unused) 86 { 87 88 ffsraw_pbuf_zone = pbuf_zsecond_create("ffsrawpbuf", 89 (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8); 90 } 91 SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL); 92 93 static int 94 ffs_rawread_sync(struct vnode *vp) 95 { 96 int error; 97 int upgraded; 98 struct bufobj *bo; 99 struct mount *mp; 100 vm_object_t obj; 101 102 /* Check for dirty mmap, pending writes and dirty buffers */ 103 bo = &vp->v_bufobj; 104 BO_LOCK(bo); 105 VI_LOCK(vp); 106 if (bo->bo_numoutput > 0 || 107 bo->bo_dirty.bv_cnt > 0 || 108 ((obj = vp->v_object) != NULL && 109 vm_object_mightbedirty(obj))) { 110 VI_UNLOCK(vp); 111 BO_UNLOCK(bo); 112 113 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 114 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 115 upgraded = 1; 116 else 117 upgraded = 0; 118 VOP_UNLOCK(vp); 119 (void) vn_start_write(vp, &mp, V_WAIT); 120 VOP_LOCK(vp, LK_EXCLUSIVE); 121 } else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 122 upgraded = 1; 123 /* Upgrade to exclusive lock, this might block */ 124 VOP_LOCK(vp, LK_UPGRADE); 125 } else 126 upgraded = 0; 127 128 129 VI_LOCK(vp); 130 /* Check if vnode was reclaimed while unlocked. */ 131 if (VN_IS_DOOMED(vp)) { 132 VI_UNLOCK(vp); 133 if (upgraded != 0) 134 VOP_LOCK(vp, LK_DOWNGRADE); 135 vn_finished_write(mp); 136 return (EIO); 137 } 138 /* Attempt to msync mmap() regions to clean dirty mmap */ 139 if ((obj = vp->v_object) != NULL && 140 vm_object_mightbedirty(obj)) { 141 VI_UNLOCK(vp); 142 VM_OBJECT_WLOCK(obj); 143 vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); 144 VM_OBJECT_WUNLOCK(obj); 145 } else 146 VI_UNLOCK(vp); 147 148 /* Wait for pending writes to complete */ 149 BO_LOCK(bo); 150 error = bufobj_wwait(&vp->v_bufobj, 0, 0); 151 if (error != 0) { 152 /* XXX: can't happen with a zero timeout ??? */ 153 BO_UNLOCK(bo); 154 if (upgraded != 0) 155 VOP_LOCK(vp, LK_DOWNGRADE); 156 vn_finished_write(mp); 157 return (error); 158 } 159 /* Flush dirty buffers */ 160 if (bo->bo_dirty.bv_cnt > 0) { 161 BO_UNLOCK(bo); 162 if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) { 163 if (upgraded != 0) 164 VOP_LOCK(vp, LK_DOWNGRADE); 165 vn_finished_write(mp); 166 return (error); 167 } 168 BO_LOCK(bo); 169 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 170 panic("ffs_rawread_sync: dirty bufs"); 171 } 172 BO_UNLOCK(bo); 173 if (upgraded != 0) 174 VOP_LOCK(vp, LK_DOWNGRADE); 175 vn_finished_write(mp); 176 } else { 177 VI_UNLOCK(vp); 178 BO_UNLOCK(bo); 179 } 180 return 0; 181 } 182 183 static int 184 ffs_rawread_readahead(struct vnode *vp, 185 caddr_t udata, 186 off_t offset, 187 size_t len, 188 struct thread *td, 189 struct buf *bp) 190 { 191 int error; 192 uint64_t iolen; 193 off_t blockno; 194 int blockoff; 195 int bsize; 196 struct vnode *dp; 197 int bforwards; 198 struct inode *ip; 199 ufs2_daddr_t blkno; 200 201 bsize = vp->v_mount->mnt_stat.f_iosize; 202 203 ip = VTOI(vp); 204 dp = ITODEVVP(ip); 205 206 iolen = ((vm_offset_t) udata) & PAGE_MASK; 207 bp->b_bcount = len; 208 if (bp->b_bcount + iolen > bp->b_kvasize) { 209 bp->b_bcount = bp->b_kvasize; 210 if (iolen != 0) 211 bp->b_bcount -= PAGE_SIZE; 212 } 213 bp->b_flags = 0; /* XXX necessary ? */ 214 bp->b_iocmd = BIO_READ; 215 bp->b_iodone = bdone; 216 blockno = offset / bsize; 217 blockoff = (offset % bsize) / DEV_BSIZE; 218 if ((daddr_t) blockno != blockno) { 219 return EINVAL; /* blockno overflow */ 220 } 221 222 bp->b_lblkno = bp->b_blkno = blockno; 223 224 error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL); 225 if (error != 0) 226 return error; 227 if (blkno == -1) { 228 /* Fill holes with NULs to preserve semantics */ 229 230 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) 231 bp->b_bcount = bsize - blockoff * DEV_BSIZE; 232 233 if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0) 234 return EFAULT; 235 236 maybe_yield(); 237 bzero(bp->b_data, bp->b_bufsize); 238 239 /* Mark operation completed (similar to bufdone()) */ 240 241 bp->b_resid = 0; 242 bp->b_flags |= B_DONE; 243 return 0; 244 } 245 bp->b_blkno = blkno + blockoff; 246 bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE; 247 248 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) 249 bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; 250 251 if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0) 252 return EFAULT; 253 254 BO_STRATEGY(&dp->v_bufobj, bp); 255 return 0; 256 } 257 258 static int 259 ffs_rawread_main(struct vnode *vp, 260 struct uio *uio) 261 { 262 int error, nerror; 263 struct buf *bp, *nbp, *tbp; 264 uint64_t iolen; 265 caddr_t udata; 266 long resid; 267 off_t offset; 268 struct thread *td; 269 270 td = uio->uio_td ? uio->uio_td : curthread; 271 udata = uio->uio_iov->iov_base; 272 resid = uio->uio_resid; 273 offset = uio->uio_offset; 274 275 /* 276 * keep the process from being swapped 277 */ 278 PHOLD(td->td_proc); 279 280 error = 0; 281 nerror = 0; 282 283 bp = NULL; 284 nbp = NULL; 285 286 while (resid > 0) { 287 288 if (bp == NULL) { /* Setup first read */ 289 bp = uma_zalloc(ffsraw_pbuf_zone, M_WAITOK); 290 pbgetvp(vp, bp); 291 error = ffs_rawread_readahead(vp, udata, offset, 292 resid, td, bp); 293 if (error != 0) 294 break; 295 296 if (resid > bp->b_bufsize) { /* Setup fist readahead */ 297 if (rawreadahead != 0) 298 nbp = uma_zalloc(ffsraw_pbuf_zone, 299 M_NOWAIT); 300 else 301 nbp = NULL; 302 if (nbp != NULL) { 303 pbgetvp(vp, nbp); 304 305 nerror = ffs_rawread_readahead(vp, 306 udata + 307 bp->b_bufsize, 308 offset + 309 bp->b_bufsize, 310 resid - 311 bp->b_bufsize, 312 td, 313 nbp); 314 if (nerror) { 315 pbrelvp(nbp); 316 uma_zfree(ffsraw_pbuf_zone, 317 nbp); 318 nbp = NULL; 319 } 320 } 321 } 322 } 323 324 bwait(bp, PRIBIO, "rawrd"); 325 vunmapbuf(bp); 326 327 iolen = bp->b_bcount - bp->b_resid; 328 if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { 329 nerror = 0; /* Ignore possible beyond EOF error */ 330 break; /* EOF */ 331 } 332 333 if ((bp->b_ioflags & BIO_ERROR) != 0) { 334 error = bp->b_error; 335 break; 336 } 337 resid -= iolen; 338 udata += iolen; 339 offset += iolen; 340 if (iolen < bp->b_bufsize) { 341 /* Incomplete read. Try to read remaining part */ 342 error = ffs_rawread_readahead(vp, 343 udata, 344 offset, 345 bp->b_bufsize - iolen, 346 td, 347 bp); 348 if (error != 0) 349 break; 350 } else if (nbp != NULL) { /* Complete read with readahead */ 351 352 tbp = bp; 353 bp = nbp; 354 nbp = tbp; 355 356 if (resid <= bp->b_bufsize) { /* No more readaheads */ 357 pbrelvp(nbp); 358 uma_zfree(ffsraw_pbuf_zone, nbp); 359 nbp = NULL; 360 } else { /* Setup next readahead */ 361 nerror = ffs_rawread_readahead(vp, 362 udata + 363 bp->b_bufsize, 364 offset + 365 bp->b_bufsize, 366 resid - 367 bp->b_bufsize, 368 td, 369 nbp); 370 if (nerror != 0) { 371 pbrelvp(nbp); 372 uma_zfree(ffsraw_pbuf_zone, nbp); 373 nbp = NULL; 374 } 375 } 376 } else if (nerror != 0) {/* Deferred Readahead error */ 377 break; 378 } else if (resid > 0) { /* More to read, no readahead */ 379 error = ffs_rawread_readahead(vp, udata, offset, 380 resid, td, bp); 381 if (error != 0) 382 break; 383 } 384 } 385 386 if (bp != NULL) { 387 pbrelvp(bp); 388 uma_zfree(ffsraw_pbuf_zone, bp); 389 } 390 if (nbp != NULL) { /* Run down readahead buffer */ 391 bwait(nbp, PRIBIO, "rawrd"); 392 vunmapbuf(nbp); 393 pbrelvp(nbp); 394 uma_zfree(ffsraw_pbuf_zone, nbp); 395 } 396 397 if (error == 0) 398 error = nerror; 399 PRELE(td->td_proc); 400 uio->uio_iov->iov_base = udata; 401 uio->uio_resid = resid; 402 uio->uio_offset = offset; 403 return error; 404 } 405 406 int 407 ffs_rawread(struct vnode *vp, 408 struct uio *uio, 409 int *workdone) 410 { 411 if (allowrawread != 0 && 412 uio->uio_iovcnt == 1 && 413 uio->uio_segflg == UIO_USERSPACE && 414 uio->uio_resid == uio->uio_iov->iov_len && 415 (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags & 416 TDP_DEADLKTREAT) == 0) { 417 int secsize; /* Media sector size */ 418 off_t filebytes; /* Bytes left of file */ 419 int blockbytes; /* Bytes left of file in full blocks */ 420 int partialbytes; /* Bytes in last partial block */ 421 int skipbytes; /* Bytes not to read in ffs_rawread */ 422 struct inode *ip; 423 int error; 424 425 426 /* Only handle sector aligned reads */ 427 ip = VTOI(vp); 428 secsize = ITODEVVP(ip)->v_bufobj.bo_bsize; 429 if ((uio->uio_offset & (secsize - 1)) == 0 && 430 (uio->uio_resid & (secsize - 1)) == 0) { 431 432 /* Sync dirty pages and buffers if needed */ 433 error = ffs_rawread_sync(vp); 434 if (error != 0) 435 return error; 436 437 /* Check for end of file */ 438 if (ip->i_size > uio->uio_offset) { 439 filebytes = ip->i_size - uio->uio_offset; 440 441 /* No special eof handling needed ? */ 442 if (uio->uio_resid <= filebytes) { 443 *workdone = 1; 444 return ffs_rawread_main(vp, uio); 445 } 446 447 partialbytes = ((unsigned int) ip->i_size) % 448 ITOFS(ip)->fs_bsize; 449 blockbytes = (int) filebytes - partialbytes; 450 if (blockbytes > 0) { 451 skipbytes = uio->uio_resid - 452 blockbytes; 453 uio->uio_resid = blockbytes; 454 error = ffs_rawread_main(vp, uio); 455 uio->uio_resid += skipbytes; 456 if (error != 0) 457 return error; 458 /* Read remaining part using buffer */ 459 } 460 } 461 } 462 } 463 *workdone = 0; 464 return 0; 465 } 466