/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2000-2003 Tor Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/rwlock.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

static int ffs_rawread_readahead(struct vnode *vp, caddr_t udata,
	    off_t offset, size_t len, struct thread *td, struct buf *bp);
static int ffs_rawread_main(struct vnode *vp, struct uio *uio);

static int ffs_rawread_sync(struct vnode *vp);

int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);

SYSCTL_DECL(_vfs_ffs);

static uma_zone_t ffsraw_pbuf_zone;

static int allowrawread = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
	   "Flag to enable raw reads");

static int rawreadahead = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
	   "Flag to enable readahead for long raw reads");

static void
ffs_rawread_setup(void *arg __unused)
{

	ffsraw_pbuf_zone = pbuf_zsecond_create("ffsrawpbuf",
	    (nswbuf > 100) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8);
}
SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL);

static int
ffs_rawread_sync(struct vnode *vp)
{
	int error;
	int upgraded;
	struct bufobj *bo;
	struct mount *mp;
	vm_object_t obj;

	/* Check for dirty mmap, pending writes and dirty buffers */
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	VI_LOCK(vp);
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    ((obj = vp->v_object) != NULL &&
	     vm_object_mightbedirty(obj))) {
		VI_UNLOCK(vp);
		BO_UNLOCK(bo);

		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
			if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
				upgraded = 1;
			else
				upgraded = 0;
			VOP_UNLOCK(vp);
			(void) vn_start_write(vp, &mp, V_WAIT);
			VOP_LOCK(vp, LK_EXCLUSIVE);
		} else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
			upgraded = 1;
			/* Upgrade to exclusive lock, this might block */
			VOP_LOCK(vp, LK_UPGRADE);
		} else
			upgraded = 0;

		VI_LOCK(vp);
		/* Check if vnode was reclaimed while unlocked. */
		if (VN_IS_DOOMED(vp)) {
			VI_UNLOCK(vp);
			if (upgraded != 0)
				VOP_LOCK(vp, LK_DOWNGRADE);
			vn_finished_write(mp);
			return (EIO);
		}
		/* Attempt to msync mmap() regions to clean dirty mmap */
		if ((obj = vp->v_object) != NULL &&
		    vm_object_mightbedirty(obj)) {
			VI_UNLOCK(vp);
			VM_OBJECT_WLOCK(obj);
			vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
			VM_OBJECT_WUNLOCK(obj);
		} else
			VI_UNLOCK(vp);

		/* Wait for pending writes to complete */
		BO_LOCK(bo);
		error = bufobj_wwait(&vp->v_bufobj, 0, 0);
		if (error != 0) {
			/* XXX: can't happen with a zero timeout ??? */
			BO_UNLOCK(bo);
			if (upgraded != 0)
				VOP_LOCK(vp, LK_DOWNGRADE);
			vn_finished_write(mp);
			return (error);
		}
		/* Flush dirty buffers */
		if (bo->bo_dirty.bv_cnt > 0) {
			BO_UNLOCK(bo);
			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) {
				if (upgraded != 0)
					VOP_LOCK(vp, LK_DOWNGRADE);
				vn_finished_write(mp);
				return (error);
			}
			BO_LOCK(bo);
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("ffs_rawread_sync: dirty bufs");
		}
		BO_UNLOCK(bo);
		if (upgraded != 0)
			VOP_LOCK(vp, LK_DOWNGRADE);
		vn_finished_write(mp);
	} else {
		VI_UNLOCK(vp);
		BO_UNLOCK(bo);
	}
	return 0;
}

static int
ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t offset,
    size_t len, struct thread *td, struct buf *bp)
{
	int error;
	u_int iolen;
	off_t blockno;
	int blockoff;
	int bsize;
	struct vnode *dp;
	int bforwards;
	struct inode *ip;
	ufs2_daddr_t blkno;

	bsize = vp->v_mount->mnt_stat.f_iosize;

	ip = VTOI(vp);
	dp = ITODEVVP(ip);

	iolen = ((vm_offset_t) udata) & PAGE_MASK;
	bp->b_bcount = len;
	if (bp->b_bcount + iolen > bp->b_kvasize) {
		bp->b_bcount = bp->b_kvasize;
		if (iolen != 0)
			bp->b_bcount -= PAGE_SIZE;
	}
	bp->b_flags = 0;	/* XXX necessary ? */
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = bdone;
	blockno = offset / bsize;
	blockoff = (offset % bsize) / DEV_BSIZE;
	if ((daddr_t) blockno != blockno) {
		return EINVAL; /* blockno overflow */
	}

	bp->b_lblkno = bp->b_blkno = blockno;

	error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL);
	if (error != 0)
		return error;
	if (blkno == -1) {
		/* Fill holes with NULs to preserve semantics */
		if (bp->b_bcount + blockoff * DEV_BSIZE > bsize)
			bp->b_bcount = bsize - blockoff * DEV_BSIZE;

		if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0)
			return EFAULT;

		maybe_yield();
		bzero(bp->b_data, bp->b_bufsize);

		/* Mark operation completed (similar to bufdone()) */
		bp->b_resid = 0;
		bp->b_flags |= B_DONE;
		return 0;
	}
	bp->b_blkno = blkno + blockoff;
	bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE;

	if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
		bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;

	if (vmapbuf(bp, udata, bp->b_bcount, 1) < 0)
		return EFAULT;

	BO_STRATEGY(&dp->v_bufobj, bp);
	return 0;
}

static int
ffs_rawread_main(struct vnode *vp, struct uio *uio)
{
	int error, nerror;
	struct buf *bp, *nbp, *tbp;
	u_int iolen;
	caddr_t udata;
	long resid;
	off_t offset;
	struct thread *td;

	td = uio->uio_td ? uio->uio_td : curthread;
	udata = uio->uio_iov->iov_base;
	resid = uio->uio_resid;
	offset = uio->uio_offset;

	/*
	 * keep the process from being swapped
	 */
	PHOLD(td->td_proc);

	error = 0;
	nerror = 0;

	bp = NULL;
	nbp = NULL;

	while (resid > 0) {

		if (bp == NULL) { /* Setup first read */
			bp = uma_zalloc(ffsraw_pbuf_zone, M_WAITOK);
			pbgetvp(vp, bp);
			error = ffs_rawread_readahead(vp, udata, offset,
			    resid, td, bp);
			if (error != 0)
				break;

			if (resid > bp->b_bufsize) { /* Setup first readahead */
				if (rawreadahead != 0)
					nbp = uma_zalloc(ffsraw_pbuf_zone,
					    M_NOWAIT);
				else
					nbp = NULL;
				if (nbp != NULL) {
					pbgetvp(vp, nbp);

					nerror = ffs_rawread_readahead(vp,
					    udata + bp->b_bufsize,
					    offset + bp->b_bufsize,
					    resid - bp->b_bufsize, td, nbp);
					if (nerror) {
						pbrelvp(nbp);
						uma_zfree(ffsraw_pbuf_zone,
						    nbp);
						nbp = NULL;
					}
				}
			}
		}

		bwait(bp, PRIBIO, "rawrd");
		vunmapbuf(bp);

		iolen = bp->b_bcount - bp->b_resid;
		if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
			nerror = 0;	/* Ignore possible beyond EOF error */
			break;		/* EOF */
		}

		if ((bp->b_ioflags & BIO_ERROR) != 0) {
			error = bp->b_error;
			break;
		}
		resid -= iolen;
		udata += iolen;
		offset += iolen;
		if (iolen < bp->b_bufsize) {
			/* Incomplete read.  Try to read remaining part */
			error = ffs_rawread_readahead(vp, udata, offset,
			    bp->b_bufsize - iolen, td, bp);
			if (error != 0)
				break;
		} else if (nbp != NULL) { /* Complete read with readahead */

			tbp = bp;
			bp = nbp;
			nbp = tbp;

			if (resid <= bp->b_bufsize) { /* No more readaheads */
				pbrelvp(nbp);
				uma_zfree(ffsraw_pbuf_zone, nbp);
				nbp = NULL;
			} else { /* Setup next readahead */
				nerror = ffs_rawread_readahead(vp,
				    udata + bp->b_bufsize,
				    offset + bp->b_bufsize,
				    resid - bp->b_bufsize, td, nbp);
				if (nerror != 0) {
					pbrelvp(nbp);
					uma_zfree(ffsraw_pbuf_zone, nbp);
					nbp = NULL;
				}
			}
		} else if (nerror != 0) { /* Deferred readahead error */
			break;
		} else if (resid > 0) { /* More to read, no readahead */
			error = ffs_rawread_readahead(vp, udata, offset,
			    resid, td, bp);
			if (error != 0)
				break;
		}
	}

	if (bp != NULL) {
		pbrelvp(bp);
		uma_zfree(ffsraw_pbuf_zone, bp);
	}
	if (nbp != NULL) {		/* Run down readahead buffer */
		bwait(nbp, PRIBIO, "rawrd");
		vunmapbuf(nbp);
		pbrelvp(nbp);
		uma_zfree(ffsraw_pbuf_zone, nbp);
	}

	if (error == 0)
		error = nerror;
	PRELE(td->td_proc);
	uio->uio_iov->iov_base = udata;
	uio->uio_resid = resid;
	uio->uio_offset = offset;
	return error;
}

int
ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone)
{
	if (allowrawread != 0 &&
	    uio->uio_iovcnt == 1 &&
	    uio->uio_segflg == UIO_USERSPACE &&
	    uio->uio_resid == uio->uio_iov->iov_len &&
	    (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags &
	     TDP_DEADLKTREAT) == 0) {
		int secsize;		/* Media sector size */
		off_t filebytes;	/* Bytes left of file */
		int blockbytes;		/* Bytes left of file in full blocks */
		int partialbytes;	/* Bytes in last partial block */
		int skipbytes;		/* Bytes not to read in ffs_rawread */
		struct inode *ip;
		int error;

		/* Only handle sector aligned reads */
		ip = VTOI(vp);
		secsize = ITODEVVP(ip)->v_bufobj.bo_bsize;
		if ((uio->uio_offset & (secsize - 1)) == 0 &&
		    (uio->uio_resid & (secsize - 1)) == 0) {

			/* Sync dirty pages and buffers if needed */
			error = ffs_rawread_sync(vp);
			if (error != 0)
				return error;

			/* Check for end of file */
			if (ip->i_size > uio->uio_offset) {
				filebytes = ip->i_size - uio->uio_offset;

				/* No special eof handling needed ? */
				if (uio->uio_resid <= filebytes) {
					*workdone = 1;
					return ffs_rawread_main(vp, uio);
				}

				partialbytes = ((unsigned int) ip->i_size) %
				    ITOFS(ip)->fs_bsize;
				blockbytes = (int) filebytes - partialbytes;
				if (blockbytes > 0) {
					skipbytes = uio->uio_resid -
					    blockbytes;
					uio->uio_resid = blockbytes;
					error = ffs_rawread_main(vp, uio);
					uio->uio_resid += skipbytes;
					if (error != 0)
						return error;
					/* Read remaining part using buffer */
				}
			}
		}
	}
	*workdone = 0;
	return 0;
}