/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2000-2003 Tor Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
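
/*
 * Raw read support: for suitably aligned direct reads, data is moved
 * between the disk and the user buffer without passing through the
 * buffer cache.  Dirty pages, dirty buffers, and pending writes on the
 * vnode are synced beforehand so that the on-disk copy is up to date.
 */
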
static int ffs_rawread_readahead(struct vnode *vp,
                                 caddr_t udata,
                                 off_t offset,
                                 size_t len,
                                 struct thread *td,
                                 struct buf *bp);
static int ffs_rawread_main(struct vnode *vp,
                            struct uio *uio);

static int ffs_rawread_sync(struct vnode *vp);

int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);

SYSCTL_DECL(_vfs_ffs);

static uma_zone_t ffsraw_pbuf_zone;

static int allowrawread = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
    "Flag to enable raw reads");

static int rawreadahead = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
    "Flag to enable readahead for long raw reads");

static void
ffs_rawread_setup(void *arg __unused)
{

        ffsraw_pbuf_zone = pbuf_zsecond_create("ffsrawpbuf",
            (nswbuf > 100) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8);
}
SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL);

static int
ffs_rawread_sync(struct vnode *vp)
{
        int error;
        int upgraded;
        struct bufobj *bo;
        struct mount *mp;
        vm_object_t obj;

        /* Check for dirty mmap, pending writes and dirty buffers */
        bo = &vp->v_bufobj;
        BO_LOCK(bo);
        VI_LOCK(vp);
        if (bo->bo_numoutput > 0 ||
            bo->bo_dirty.bv_cnt > 0 ||
            ((obj = vp->v_object) != NULL &&
             vm_object_mightbedirty(obj))) {
                VI_UNLOCK(vp);
                BO_UNLOCK(bo);

                if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
                        if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
                                upgraded = 1;
                        else
                                upgraded = 0;
                        VOP_UNLOCK(vp);
                        (void) vn_start_write(vp, &mp, V_WAIT);
                        VOP_LOCK(vp, LK_EXCLUSIVE);
                } else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
                        upgraded = 1;
                        /* Upgrade to exclusive lock, this might block */
                        VOP_LOCK(vp, LK_UPGRADE);
                } else
                        upgraded = 0;

                VI_LOCK(vp);
                /* Check if vnode was reclaimed while unlocked. */
                if (VN_IS_DOOMED(vp)) {
                        VI_UNLOCK(vp);
                        if (upgraded != 0)
                                VOP_LOCK(vp, LK_DOWNGRADE);
                        vn_finished_write(mp);
                        return (EIO);
                }
                /* Attempt to msync mmap() regions to clean dirty mmap */
                if ((obj = vp->v_object) != NULL &&
                    vm_object_mightbedirty(obj)) {
                        VI_UNLOCK(vp);
                        VM_OBJECT_WLOCK(obj);
                        vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
                        VM_OBJECT_WUNLOCK(obj);
                } else
                        VI_UNLOCK(vp);

                /* Wait for pending writes to complete */
                BO_LOCK(bo);
                error = bufobj_wwait(&vp->v_bufobj, 0, 0);
                if (error != 0) {
                        /* XXX: can't happen with a zero timeout ??? */
                        BO_UNLOCK(bo);
                        if (upgraded != 0)
                                VOP_LOCK(vp, LK_DOWNGRADE);
                        vn_finished_write(mp);
                        return (error);
                }
                /* Flush dirty buffers */
                if (bo->bo_dirty.bv_cnt > 0) {
                        BO_UNLOCK(bo);
                        if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) {
                                if (upgraded != 0)
                                        VOP_LOCK(vp, LK_DOWNGRADE);
                                vn_finished_write(mp);
                                return (error);
                        }
                        BO_LOCK(bo);
                        if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
                                panic("ffs_rawread_sync: dirty bufs");
                }
                BO_UNLOCK(bo);
                if (upgraded != 0)
                        VOP_LOCK(vp, LK_DOWNGRADE);
                vn_finished_write(mp);
        } else {
                VI_UNLOCK(vp);
                BO_UNLOCK(bo);
        }
        return (0);
}
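
/*
 * Start one raw transfer of up to len bytes from offset into the user
 * buffer udata, reusing the pbuf bp.  The request is clipped to the
 * buffer's KVA size and to the contiguous run of blocks reported by
 * ufs_bmaparray().  A hole is zero-filled in place and marked B_DONE
 * without issuing any I/O; otherwise the read is started and the
 * caller must wait for completion with bwait().
 */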
static int
ffs_rawread_readahead(struct vnode *vp,
                      caddr_t udata,
                      off_t offset,
                      size_t len,
                      struct thread *td,
                      struct buf *bp)
{
        int error;
        uint64_t iolen;
        off_t blockno;
        int blockoff;
        int bsize;
        struct vnode *dp;
        int bforwards;
        struct inode *ip;
        ufs2_daddr_t blkno;

        bsize = vp->v_mount->mnt_stat.f_iosize;

        ip = VTOI(vp);
        dp = ITODEVVP(ip);

        iolen = ((vm_offset_t) udata) & PAGE_MASK;
        bp->b_bcount = len;
        if (bp->b_bcount + iolen > bp->b_kvasize) {
                bp->b_bcount = bp->b_kvasize;
                if (iolen != 0)
                        bp->b_bcount -= PAGE_SIZE;
        }
        bp->b_flags = 0;        /* XXX necessary ? */
        bp->b_iocmd = BIO_READ;
        bp->b_iodone = bdone;
        blockno = offset / bsize;
        blockoff = (offset % bsize) / DEV_BSIZE;
        if ((daddr_t) blockno != blockno) {
                return (EINVAL); /* blockno overflow */
        }

        bp->b_lblkno = bp->b_blkno = blockno;

        error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL);
        if (error != 0)
                return (error);
        if (blkno == -1) {
                /* Fill holes with NULs to preserve semantics */

                if (bp->b_bcount + blockoff * DEV_BSIZE > bsize)
                        bp->b_bcount = bsize - blockoff * DEV_BSIZE;

                if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0)
                        return (EFAULT);

                maybe_yield();
                bzero(bp->b_data, bp->b_bufsize);

                /* Mark operation completed (similar to bufdone()) */

                bp->b_resid = 0;
                bp->b_flags |= B_DONE;
                return (0);
        }
        bp->b_blkno = blkno + blockoff;
        bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE;

        if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
                bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;

        if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0)
                return (EFAULT);

        BO_STRATEGY(&dp->v_bufobj, bp);
        return (0);
}
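
/*
 * Loop over the request described by uio, keeping one transfer in
 * flight (bp) and, when rawreadahead is enabled and more data remains,
 * one speculative readahead (nbp).  Short transfers are reissued for
 * the remaining bytes; a zero-length transfer without an error means
 * EOF.  On return the uio reflects what was actually transferred.
 */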
static int
ffs_rawread_main(struct vnode *vp,
                 struct uio *uio)
{
        int error, nerror;
        struct buf *bp, *nbp, *tbp;
        uint64_t iolen;
        caddr_t udata;
        long resid;
        off_t offset;
        struct thread *td;

        td = uio->uio_td ? uio->uio_td : curthread;
        udata = uio->uio_iov->iov_base;
        resid = uio->uio_resid;
        offset = uio->uio_offset;

        /*
         * Keep the process from being swapped.
         */
        PHOLD(td->td_proc);

        error = 0;
        nerror = 0;

        bp = NULL;
        nbp = NULL;

        while (resid > 0) {

                if (bp == NULL) { /* Set up first read */
                        bp = uma_zalloc(ffsraw_pbuf_zone, M_WAITOK);
                        pbgetvp(vp, bp);
                        error = ffs_rawread_readahead(vp, udata, offset,
                            resid, td, bp);
                        if (error != 0)
                                break;

                        if (resid > bp->b_bufsize) { /* Set up first readahead */
                                if (rawreadahead != 0)
                                        nbp = uma_zalloc(ffsraw_pbuf_zone,
                                            M_NOWAIT);
                                else
                                        nbp = NULL;
                                if (nbp != NULL) {
                                        pbgetvp(vp, nbp);

                                        nerror = ffs_rawread_readahead(vp,
                                            udata + bp->b_bufsize,
                                            offset + bp->b_bufsize,
                                            resid - bp->b_bufsize, td, nbp);
                                        if (nerror != 0) {
                                                pbrelvp(nbp);
                                                uma_zfree(ffsraw_pbuf_zone,
                                                    nbp);
                                                nbp = NULL;
                                        }
                                }
                        }
                }

                bwait(bp, PRIBIO, "rawrd");
                vunmapbuf(bp);

                iolen = bp->b_bcount - bp->b_resid;
                if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
                        nerror = 0;     /* Ignore possible beyond EOF error */
                        break;          /* EOF */
                }

                if ((bp->b_ioflags & BIO_ERROR) != 0) {
                        error = bp->b_error;
                        break;
                }
                resid -= iolen;
                udata += iolen;
                offset += iolen;
                if (iolen < bp->b_bufsize) {
                        /* Incomplete read.  Try to read the remaining part. */
                        error = ffs_rawread_readahead(vp, udata, offset,
                            bp->b_bufsize - iolen, td, bp);
                        if (error != 0)
                                break;
                } else if (nbp != NULL) { /* Complete read with readahead */

                        tbp = bp;
                        bp = nbp;
                        nbp = tbp;

                        if (resid <= bp->b_bufsize) { /* No more readaheads */
                                pbrelvp(nbp);
                                uma_zfree(ffsraw_pbuf_zone, nbp);
                                nbp = NULL;
                        } else { /* Set up next readahead */
                                nerror = ffs_rawread_readahead(vp,
                                    udata + bp->b_bufsize,
                                    offset + bp->b_bufsize,
                                    resid - bp->b_bufsize, td, nbp);
                                if (nerror != 0) {
                                        pbrelvp(nbp);
                                        uma_zfree(ffsraw_pbuf_zone, nbp);
                                        nbp = NULL;
                                }
                        }
                } else if (nerror != 0) { /* Deferred readahead error */
                        break;
                } else if (resid > 0) { /* More to read, no readahead */
                        error = ffs_rawread_readahead(vp, udata, offset,
                            resid, td, bp);
                        if (error != 0)
                                break;
                }
        }

        if (bp != NULL) {
                pbrelvp(bp);
                uma_zfree(ffsraw_pbuf_zone, bp);
        }
        if (nbp != NULL) {      /* Run down readahead buffer */
                bwait(nbp, PRIBIO, "rawrd");
                vunmapbuf(nbp);
                pbrelvp(nbp);
                uma_zfree(ffsraw_pbuf_zone, nbp);
        }

        if (error == 0)
                error = nerror;
        PRELE(td->td_proc);
        uio->uio_iov->iov_base = udata;
        uio->uio_resid = resid;
        uio->uio_offset = offset;
        return (error);
}

/*
 * Entry point: decide whether this request can be handled raw.  The
 * request must be a single sector-aligned userspace iovec; only whole
 * blocks up to EOF are read raw, and *workdone is set to 1 only when
 * the entire request was satisfied here.  Anything else is left for
 * the caller to read through the buffer cache.
 */
int
ffs_rawread(struct vnode *vp,
            struct uio *uio,
            int *workdone)
{
        if (allowrawread != 0 &&
            uio->uio_iovcnt == 1 &&
            uio->uio_segflg == UIO_USERSPACE &&
            uio->uio_resid == uio->uio_iov->iov_len &&
            (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags &
             TDP_DEADLKTREAT) == 0) {
                int secsize;            /* Media sector size */
                off_t filebytes;        /* Bytes left of file */
                int blockbytes;         /* Bytes left of file in full blocks */
                int partialbytes;       /* Bytes in last partial block */
                int skipbytes;          /* Bytes not to read in ffs_rawread */
                struct inode *ip;
                int error;

                /* Only handle sector aligned reads */
                ip = VTOI(vp);
                secsize = ITODEVVP(ip)->v_bufobj.bo_bsize;
                if ((uio->uio_offset & (secsize - 1)) == 0 &&
                    (uio->uio_resid & (secsize - 1)) == 0) {

                        /* Sync dirty pages and buffers if needed */
                        error = ffs_rawread_sync(vp);
                        if (error != 0)
                                return (error);

                        /* Check for end of file */
                        if (ip->i_size > uio->uio_offset) {
                                filebytes = ip->i_size - uio->uio_offset;

                                /* No special EOF handling needed? */
                                if (uio->uio_resid <= filebytes) {
                                        *workdone = 1;
                                        return (ffs_rawread_main(vp, uio));
                                }

                                partialbytes = ((unsigned int) ip->i_size) %
                                    ITOFS(ip)->fs_bsize;
                                blockbytes = (int) filebytes - partialbytes;
                                if (blockbytes > 0) {
                                        skipbytes = uio->uio_resid -
                                            blockbytes;
                                        uio->uio_resid = blockbytes;
                                        error = ffs_rawread_main(vp, uio);
                                        uio->uio_resid += skipbytes;
                                        if (error != 0)
                                                return (error);
                                        /* Read remaining part using buffer */
                                }
                        }
                }
        }
        *workdone = 0;
        return (0);
}
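
/*
 * Illustrative userland sketch (an assumption for illustration, not
 * part of this file): on a kernel built with "options DIRECTIO", a
 * read of this shape passes the checks in ffs_rawread() above -- a
 * single iovec whose offset and length are multiples of the media
 * sector size, on a descriptor opened with O_DIRECT:
 *
 *      int fd = open("/mnt/data/file", O_RDONLY | O_DIRECT);
 *      char *buf = malloc(65536);
 *      ssize_t n = pread(fd, buf, 65536, 0);
 */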