1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2000-2003 Tor Egge 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/systm.h> 30 #include <sys/bio.h> 31 #include <sys/buf.h> 32 #include <sys/conf.h> 33 #include <sys/fcntl.h> 34 #include <sys/kernel.h> 35 #include <sys/limits.h> 36 #include <sys/mount.h> 37 #include <sys/namei.h> 38 #include <sys/proc.h> 39 #include <sys/rwlock.h> 40 #include <sys/stat.h> 41 #include <sys/sysctl.h> 42 #include <sys/vnode.h> 43 44 #include <ufs/ufs/extattr.h> 45 #include <ufs/ufs/quota.h> 46 #include <ufs/ufs/inode.h> 47 #include <ufs/ufs/ufsmount.h> 48 #include <ufs/ufs/ufs_extern.h> 49 #include <ufs/ffs/fs.h> 50 #include <ufs/ffs/ffs_extern.h> 51 52 #include <vm/vm.h> 53 #include <vm/vm_extern.h> 54 #include <vm/vm_object.h> 55 #include <vm/vnode_pager.h> 56 57 static int ffs_rawread_readahead(struct vnode *vp, 58 caddr_t udata, 59 off_t offset, 60 size_t len, 61 struct thread *td, 62 struct buf *bp); 63 static int ffs_rawread_main(struct vnode *vp, 64 struct uio *uio); 65 66 static int ffs_rawread_sync(struct vnode *vp); 67 68 int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); 69 70 SYSCTL_DECL(_vfs_ffs); 71 72 static uma_zone_t ffsraw_pbuf_zone; 73 74 static int allowrawread = 1; 75 SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, 76 "Flag to enable raw reads"); 77 78 static int rawreadahead = 1; 79 SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, 80 "Flag to enable readahead for long raw reads"); 81 82 static void 83 ffs_rawread_setup(void *arg __unused) 84 { 85 86 ffsraw_pbuf_zone = pbuf_zsecond_create("ffsrawpbuf", 87 (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8); 88 } 89 SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL); 90 91 static int 92 ffs_rawread_sync(struct vnode *vp) 93 { 94 int error; 95 int upgraded; 96 struct bufobj *bo; 97 struct mount *mp; 98 vm_object_t obj; 99 100 /* Check for dirty mmap, pending writes and dirty buffers */ 101 bo = &vp->v_bufobj; 102 BO_LOCK(bo); 103 VI_LOCK(vp); 104 if (bo->bo_numoutput > 0 || 105 bo->bo_dirty.bv_cnt > 0 || 106 ((obj = vp->v_object) != NULL && 107 vm_object_mightbedirty(obj))) { 108 VI_UNLOCK(vp); 109 BO_UNLOCK(bo); 110 111 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 112 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 113 upgraded = 1; 114 else 115 upgraded = 0; 116 VOP_UNLOCK(vp); 117 (void) vn_start_write(vp, &mp, V_WAIT); 118 VOP_LOCK(vp, LK_EXCLUSIVE); 119 } else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 120 upgraded = 1; 121 /* Upgrade to exclusive lock, this might block */ 122 VOP_LOCK(vp, LK_UPGRADE); 123 } else 124 upgraded = 0; 125 126 127 VI_LOCK(vp); 128 /* Check if vnode was reclaimed while unlocked. */ 129 if (VN_IS_DOOMED(vp)) { 130 VI_UNLOCK(vp); 131 if (upgraded != 0) 132 VOP_LOCK(vp, LK_DOWNGRADE); 133 vn_finished_write(mp); 134 return (EIO); 135 } 136 VI_UNLOCK(vp); 137 138 /* Attempt to msync mmap() regions to clean dirty mmap */ 139 vnode_pager_clean_sync(vp); 140 141 /* Wait for pending writes to complete */ 142 BO_LOCK(bo); 143 error = bufobj_wwait(&vp->v_bufobj, 0, 0); 144 if (error != 0) { 145 /* XXX: can't happen with a zero timeout ??? */ 146 BO_UNLOCK(bo); 147 if (upgraded != 0) 148 VOP_LOCK(vp, LK_DOWNGRADE); 149 vn_finished_write(mp); 150 return (error); 151 } 152 /* Flush dirty buffers */ 153 if (bo->bo_dirty.bv_cnt > 0) { 154 BO_UNLOCK(bo); 155 if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) { 156 if (upgraded != 0) 157 VOP_LOCK(vp, LK_DOWNGRADE); 158 vn_finished_write(mp); 159 return (error); 160 } 161 BO_LOCK(bo); 162 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 163 panic("ffs_rawread_sync: dirty bufs"); 164 } 165 BO_UNLOCK(bo); 166 if (upgraded != 0) 167 VOP_LOCK(vp, LK_DOWNGRADE); 168 vn_finished_write(mp); 169 } else { 170 VI_UNLOCK(vp); 171 BO_UNLOCK(bo); 172 } 173 return 0; 174 } 175 176 static int 177 ffs_rawread_readahead(struct vnode *vp, 178 caddr_t udata, 179 off_t offset, 180 size_t len, 181 struct thread *td, 182 struct buf *bp) 183 { 184 int error; 185 uint64_t iolen; 186 off_t blockno; 187 int blockoff; 188 int bsize; 189 struct vnode *dp; 190 int bforwards; 191 struct inode *ip; 192 ufs2_daddr_t blkno; 193 194 bsize = vp->v_mount->mnt_stat.f_iosize; 195 196 ip = VTOI(vp); 197 dp = ITODEVVP(ip); 198 199 iolen = ((vm_offset_t) udata) & PAGE_MASK; 200 bp->b_bcount = len; 201 if (bp->b_bcount + iolen > bp->b_kvasize) { 202 bp->b_bcount = bp->b_kvasize; 203 if (iolen != 0) 204 bp->b_bcount -= PAGE_SIZE; 205 } 206 bp->b_flags = 0; /* XXX necessary ? */ 207 bp->b_iocmd = BIO_READ; 208 bp->b_iodone = bdone; 209 blockno = offset / bsize; 210 blockoff = (offset % bsize) / DEV_BSIZE; 211 if ((daddr_t) blockno != blockno) { 212 return EINVAL; /* blockno overflow */ 213 } 214 215 bp->b_lblkno = bp->b_blkno = blockno; 216 217 error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL); 218 if (error != 0) 219 return error; 220 if (blkno == -1) { 221 /* Fill holes with NULs to preserve semantics */ 222 223 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) 224 bp->b_bcount = bsize - blockoff * DEV_BSIZE; 225 226 if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0) 227 return EFAULT; 228 229 maybe_yield(); 230 bzero(bp->b_data, bp->b_bufsize); 231 232 /* Mark operation completed (similar to bufdone()) */ 233 234 bp->b_resid = 0; 235 bp->b_flags |= B_DONE; 236 return 0; 237 } 238 bp->b_blkno = blkno + blockoff; 239 bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE; 240 241 if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) 242 bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; 243 244 if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0) 245 return EFAULT; 246 247 BO_STRATEGY(&dp->v_bufobj, bp); 248 return 0; 249 } 250 251 static int 252 ffs_rawread_main(struct vnode *vp, 253 struct uio *uio) 254 { 255 int error, nerror; 256 struct buf *bp, *nbp, *tbp; 257 uint64_t iolen; 258 caddr_t udata; 259 long resid; 260 off_t offset; 261 struct thread *td; 262 263 td = uio->uio_td ? uio->uio_td : curthread; 264 udata = uio->uio_iov->iov_base; 265 resid = uio->uio_resid; 266 offset = uio->uio_offset; 267 268 /* 269 * keep the process from being swapped 270 */ 271 PHOLD(td->td_proc); 272 273 error = 0; 274 nerror = 0; 275 276 bp = NULL; 277 nbp = NULL; 278 279 while (resid > 0) { 280 281 if (bp == NULL) { /* Setup first read */ 282 bp = uma_zalloc(ffsraw_pbuf_zone, M_WAITOK); 283 pbgetvp(vp, bp); 284 error = ffs_rawread_readahead(vp, udata, offset, 285 resid, td, bp); 286 if (error != 0) 287 break; 288 289 if (resid > bp->b_bufsize) { /* Setup fist readahead */ 290 if (rawreadahead != 0) 291 nbp = uma_zalloc(ffsraw_pbuf_zone, 292 M_NOWAIT); 293 else 294 nbp = NULL; 295 if (nbp != NULL) { 296 pbgetvp(vp, nbp); 297 298 nerror = ffs_rawread_readahead(vp, 299 udata + 300 bp->b_bufsize, 301 offset + 302 bp->b_bufsize, 303 resid - 304 bp->b_bufsize, 305 td, 306 nbp); 307 if (nerror) { 308 pbrelvp(nbp); 309 uma_zfree(ffsraw_pbuf_zone, 310 nbp); 311 nbp = NULL; 312 } 313 } 314 } 315 } 316 317 bwait(bp, PRIBIO, "rawrd"); 318 vunmapbuf(bp); 319 320 iolen = bp->b_bcount - bp->b_resid; 321 if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { 322 nerror = 0; /* Ignore possible beyond EOF error */ 323 break; /* EOF */ 324 } 325 326 if ((bp->b_ioflags & BIO_ERROR) != 0) { 327 error = bp->b_error; 328 break; 329 } 330 resid -= iolen; 331 udata += iolen; 332 offset += iolen; 333 if (iolen < bp->b_bufsize) { 334 /* Incomplete read. Try to read remaining part */ 335 error = ffs_rawread_readahead(vp, 336 udata, 337 offset, 338 bp->b_bufsize - iolen, 339 td, 340 bp); 341 if (error != 0) 342 break; 343 } else if (nbp != NULL) { /* Complete read with readahead */ 344 345 tbp = bp; 346 bp = nbp; 347 nbp = tbp; 348 349 if (resid <= bp->b_bufsize) { /* No more readaheads */ 350 pbrelvp(nbp); 351 uma_zfree(ffsraw_pbuf_zone, nbp); 352 nbp = NULL; 353 } else { /* Setup next readahead */ 354 nerror = ffs_rawread_readahead(vp, 355 udata + 356 bp->b_bufsize, 357 offset + 358 bp->b_bufsize, 359 resid - 360 bp->b_bufsize, 361 td, 362 nbp); 363 if (nerror != 0) { 364 pbrelvp(nbp); 365 uma_zfree(ffsraw_pbuf_zone, nbp); 366 nbp = NULL; 367 } 368 } 369 } else if (nerror != 0) {/* Deferred Readahead error */ 370 break; 371 } else if (resid > 0) { /* More to read, no readahead */ 372 error = ffs_rawread_readahead(vp, udata, offset, 373 resid, td, bp); 374 if (error != 0) 375 break; 376 } 377 } 378 379 if (bp != NULL) { 380 pbrelvp(bp); 381 uma_zfree(ffsraw_pbuf_zone, bp); 382 } 383 if (nbp != NULL) { /* Run down readahead buffer */ 384 bwait(nbp, PRIBIO, "rawrd"); 385 vunmapbuf(nbp); 386 pbrelvp(nbp); 387 uma_zfree(ffsraw_pbuf_zone, nbp); 388 } 389 390 if (error == 0) 391 error = nerror; 392 PRELE(td->td_proc); 393 uio->uio_iov->iov_base = udata; 394 uio->uio_resid = resid; 395 uio->uio_offset = offset; 396 return error; 397 } 398 399 int 400 ffs_rawread(struct vnode *vp, 401 struct uio *uio, 402 int *workdone) 403 { 404 if (allowrawread != 0 && 405 uio->uio_iovcnt == 1 && 406 uio->uio_segflg == UIO_USERSPACE && 407 uio->uio_resid == uio->uio_iov->iov_len && 408 (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags & 409 TDP_DEADLKTREAT) == 0) { 410 int secsize; /* Media sector size */ 411 off_t filebytes; /* Bytes left of file */ 412 int blockbytes; /* Bytes left of file in full blocks */ 413 int partialbytes; /* Bytes in last partial block */ 414 int skipbytes; /* Bytes not to read in ffs_rawread */ 415 struct inode *ip; 416 int error; 417 418 419 /* Only handle sector aligned reads */ 420 ip = VTOI(vp); 421 secsize = ITODEVVP(ip)->v_bufobj.bo_bsize; 422 if ((uio->uio_offset & (secsize - 1)) == 0 && 423 (uio->uio_resid & (secsize - 1)) == 0) { 424 425 /* Sync dirty pages and buffers if needed */ 426 error = ffs_rawread_sync(vp); 427 if (error != 0) 428 return error; 429 430 /* Check for end of file */ 431 if (ip->i_size > uio->uio_offset) { 432 filebytes = ip->i_size - uio->uio_offset; 433 434 /* No special eof handling needed ? */ 435 if (uio->uio_resid <= filebytes) { 436 *workdone = 1; 437 return ffs_rawread_main(vp, uio); 438 } 439 440 partialbytes = ((unsigned int) ip->i_size) % 441 ITOFS(ip)->fs_bsize; 442 blockbytes = (int) filebytes - partialbytes; 443 if (blockbytes > 0) { 444 skipbytes = uio->uio_resid - 445 blockbytes; 446 uio->uio_resid = blockbytes; 447 error = ffs_rawread_main(vp, uio); 448 uio->uio_resid += skipbytes; 449 if (error != 0) 450 return error; 451 /* Read remaining part using buffer */ 452 } 453 } 454 } 455 } 456 *workdone = 0; 457 return 0; 458 } 459