1 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #include <sys/types.h> 40 #include <sys/thread.h> 41 #include <sys/t_lock.h> 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/bitmap.h> 45 #include <sys/buf.h> 46 #include <sys/cmn_err.h> 47 #include <sys/conf.h> 48 #include <sys/ddi.h> 49 #include <sys/debug.h> 50 #include <sys/errno.h> 51 #include <sys/time.h> 52 #include <sys/fcntl.h> 53 #include <sys/flock.h> 54 #include <sys/file.h> 55 #include <sys/kmem.h> 56 #include <sys/mman.h> 57 #include <sys/vmsystm.h> 58 #include <sys/open.h> 59 #include <sys/swap.h> 60 #include <sys/sysmacros.h> 61 #include <sys/uio.h> 62 #include <sys/vfs.h> 63 #include <sys/vfs_opreg.h> 64 #include <sys/vnode.h> 65 #include <sys/stat.h> 66 #include <sys/poll.h> 67 #include <sys/zmod.h> 68 #include <sys/fs/decomp.h> 69 70 #include <vm/hat.h> 71 #include <vm/as.h> 72 #include <vm/page.h> 73 #include <vm/pvn.h> 74 #include <vm/seg_vn.h> 75 #include <vm/seg_kmem.h> 76 #include <vm/seg_map.h> 77 78 #include <fs/fs_subr.h> 79 80 /* 81 * dcfs - A filesystem for automatic decompressing of fiocompressed files 82 * 83 * This filesystem is a layered filesystem that sits on top of a normal 84 * persistent filesystem and provides automatic decompression of files 85 * that have been previously compressed and stored on the host file system. 86 * This is a pseudo filesystem in that it does not persist data, rather it 87 * intercepts file lookup requests on the host filesystem and provides 88 * transparent decompression of those files. Currently the only supported 89 * host filesystem is ufs. 90 * 91 * A file is compressed via a userland utility (currently cmd/boot/fiocompress) 92 * and marked by fiocompress as a compressed file via a flag in the on-disk 93 * inode (set via a ufs ioctl() - see `ufs_vnops.c`ufs_ioctl()`_FIO_COMPRESSED 94 * ufs_lookup checks for this flag and if set, passes control to decompvp 95 * a function defined in this (dcfs) filesystem. decomvp uncompresses the file 96 * and returns a dcfs vnode to the VFS layer. 97 * 98 * dcfs is layered on top of ufs and passes requests involving persistence 99 * to the underlying ufs filesystem. The compressed files currently cannot be 100 * written to. 101 */ 102 103 104 /* 105 * Define data structures within this file. 106 */ 107 #define DCSHFT 5 108 #define DCTABLESIZE 16 109 110 #if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0) 111 #define DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1)) 112 #else 113 #define DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) % DTABLESIZEC) 114 #endif 115 116 #define DCLRUSIZE 16 117 118 #define DCCACHESIZE 4 119 120 #define rounddown(x, y) ((x) & ~((y) - 1)) 121 122 struct dcnode *dctable[DCTABLESIZE]; 123 124 struct dcnode *dclru; 125 static int dclru_len; 126 127 kmutex_t dctable_lock; 128 129 dev_t dcdev; 130 struct vfs dc_vfs; 131 132 struct kmem_cache *dcnode_cache; 133 struct kmem_cache *dcbuf_cache[DCCACHESIZE]; 134 135 kmutex_t dccache_lock; 136 137 static int dcinit(int, char *); 138 139 static struct dcnode *dcnode_alloc(void); 140 static void dcnode_free(struct dcnode *); 141 static void dcnode_recycle(struct dcnode *); 142 143 static void dcinsert(struct dcnode *); 144 static void dcdelete(struct dcnode *); 145 static struct dcnode *dcfind(struct vnode *); 146 static void dclru_add(struct dcnode *); 147 static void dclru_sub(struct dcnode *); 148 149 150 /* 151 * This is the loadable module wrapper. 152 */ 153 #include <sys/modctl.h> 154 155 struct vfsops *dc_vfsops; 156 157 static vfsdef_t vfw = { 158 VFSDEF_VERSION, 159 "dcfs", 160 dcinit, 161 VSW_ZMOUNT, 162 NULL 163 }; 164 165 /* 166 * Module linkage information for the kernel. 167 */ 168 extern struct mod_ops mod_fsops; 169 170 static struct modlfs modlfs = { 171 &mod_fsops, "compressed filesystem", &vfw 172 }; 173 174 static struct modlinkage modlinkage = { 175 MODREV_1, (void *)&modlfs, NULL 176 }; 177 178 int 179 _init() 180 { 181 return (mod_install(&modlinkage)); 182 } 183 184 int 185 _info(struct modinfo *modinfop) 186 { 187 return (mod_info(&modlinkage, modinfop)); 188 } 189 190 191 static int dc_open(struct vnode **, int, struct cred *, caller_context_t *); 192 static int dc_close(struct vnode *, int, int, offset_t, 193 struct cred *, caller_context_t *); 194 static int dc_read(struct vnode *, struct uio *, int, struct cred *, 195 struct caller_context *); 196 static int dc_getattr(struct vnode *, struct vattr *, int, 197 struct cred *, caller_context_t *); 198 static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *, 199 struct caller_context *); 200 static int dc_access(struct vnode *, int, int, 201 struct cred *, caller_context_t *); 202 static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *); 203 static void dc_inactive(struct vnode *, struct cred *, caller_context_t *); 204 static int dc_fid(struct vnode *, struct fid *, caller_context_t *); 205 static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *); 206 static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t, 207 struct flk_callback *, struct cred *, caller_context_t *); 208 static int dc_realvp(struct vnode *, struct vnode **, caller_context_t *); 209 static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *, 210 struct page **, size_t, struct seg *, caddr_t, enum seg_rw, 211 struct cred *, caller_context_t *); 212 static int dc_putpage(struct vnode *, offset_t, size_t, int, 213 struct cred *, caller_context_t *); 214 static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t, 215 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *); 216 static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t, 217 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *); 218 static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t, 219 uint_t, uint_t, uint_t, struct cred *, caller_context_t *); 220 221 struct vnodeops *dc_vnodeops; 222 223 const fs_operation_def_t dc_vnodeops_template[] = { 224 VOPNAME_OPEN, { .vop_open = dc_open }, 225 VOPNAME_CLOSE, { .vop_close = dc_close }, 226 VOPNAME_READ, { .vop_read = dc_read }, 227 VOPNAME_GETATTR, { .vop_getattr = dc_getattr }, 228 VOPNAME_SETATTR, { .vop_setattr = dc_setattr }, 229 VOPNAME_ACCESS, { .vop_access = dc_access }, 230 VOPNAME_FSYNC, { .vop_fsync = dc_fsync }, 231 VOPNAME_INACTIVE, { .vop_inactive = dc_inactive }, 232 VOPNAME_FID, { .vop_fid = dc_fid }, 233 VOPNAME_SEEK, { .vop_seek = dc_seek }, 234 VOPNAME_FRLOCK, { .vop_frlock = dc_frlock }, 235 VOPNAME_REALVP, { .vop_realvp = dc_realvp }, 236 VOPNAME_GETPAGE, { .vop_getpage = dc_getpage }, 237 VOPNAME_PUTPAGE, { .vop_putpage = dc_putpage }, 238 VOPNAME_MAP, { .vop_map = dc_map }, 239 VOPNAME_ADDMAP, { .vop_addmap = dc_addmap }, 240 VOPNAME_DELMAP, { .vop_delmap = dc_delmap }, 241 NULL, NULL 242 }; 243 244 /*ARGSUSED*/ 245 static int 246 dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp) 247 { 248 return (0); 249 } 250 251 /*ARGSUSED*/ 252 static int 253 dc_close(struct vnode *vp, int flag, int count, offset_t off, 254 struct cred *cr, caller_context_t *ctp) 255 { 256 (void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 257 cleanshares(vp, ttoproc(curthread)->p_pid); 258 return (0); 259 } 260 261 /*ARGSUSED*/ 262 static int 263 dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr, 264 struct caller_context *ct) 265 { 266 struct dcnode *dp = VTODC(vp); 267 size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize); 268 size_t fsize = dp->dc_hdr->ch_fsize; 269 int error; 270 271 /* 272 * Loop through file with segmap, decompression will occur 273 * in dc_getapage 274 */ 275 do { 276 caddr_t base; 277 size_t n; 278 offset_t mapon; 279 280 /* 281 * read to end of block or file 282 */ 283 mapon = uiop->uio_loffset & (rdsize - 1); 284 n = MIN(rdsize - mapon, uiop->uio_resid); 285 n = MIN(n, fsize - uiop->uio_loffset); 286 if (n == 0) 287 return (0); /* at EOF */ 288 289 base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1, 290 S_READ); 291 error = uiomove(base + mapon, n, UIO_READ, uiop); 292 if (!error) { 293 uint_t flags; 294 295 if (n + mapon == rdsize || uiop->uio_loffset == fsize) 296 flags = SM_DONTNEED; 297 else 298 flags = 0; 299 error = segmap_release(segkmap, base, flags); 300 } else 301 (void) segmap_release(segkmap, base, 0); 302 } while (!error && uiop->uio_resid); 303 304 return (error); 305 } 306 307 static int 308 dc_getattr(struct vnode *vp, struct vattr *vap, int flags, 309 cred_t *cred, caller_context_t *ctp) 310 { 311 struct dcnode *dp = VTODC(vp); 312 struct vnode *subvp = dp->dc_subvp; 313 int error; 314 315 error = VOP_GETATTR(subvp, vap, flags, cred, ctp); 316 317 /* substitute uncompressed size */ 318 vap->va_size = dp->dc_hdr->ch_fsize; 319 return (error); 320 } 321 322 static int 323 dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred, 324 caller_context_t *ctp) 325 { 326 struct dcnode *dp = VTODC(vp); 327 struct vnode *subvp = dp->dc_subvp; 328 329 return (VOP_SETATTR(subvp, vap, flags, cred, ctp)); 330 } 331 332 static int 333 dc_access(struct vnode *vp, int mode, int flags, 334 cred_t *cred, caller_context_t *ctp) 335 { 336 struct dcnode *dp = VTODC(vp); 337 struct vnode *subvp = dp->dc_subvp; 338 339 return (VOP_ACCESS(subvp, mode, flags, cred, ctp)); 340 } 341 342 /*ARGSUSED*/ 343 static int 344 dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp) 345 { 346 return (0); 347 } 348 349 /*ARGSUSED*/ 350 static void 351 dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp) 352 { 353 struct dcnode *dp = VTODC(vp); 354 355 mutex_enter(&dctable_lock); 356 mutex_enter(&vp->v_lock); 357 ASSERT(vp->v_count >= 1); 358 if (--vp->v_count != 0) { 359 /* 360 * Somebody accessed the dcnode before we got a chance to 361 * remove it. They will remove it when they do a vn_rele. 362 */ 363 mutex_exit(&vp->v_lock); 364 mutex_exit(&dctable_lock); 365 return; 366 } 367 mutex_exit(&vp->v_lock); 368 369 dcnode_free(dp); 370 371 mutex_exit(&dctable_lock); 372 } 373 374 static int 375 dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp) 376 { 377 struct dcnode *dp = VTODC(vp); 378 struct vnode *subvp = dp->dc_subvp; 379 380 return (VOP_FID(subvp, fidp, ctp)); 381 } 382 383 static int 384 dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp) 385 { 386 struct dcnode *dp = VTODC(vp); 387 struct vnode *subvp = dp->dc_subvp; 388 389 return (VOP_SEEK(subvp, oof, noffp, ctp)); 390 } 391 392 static int 393 dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, 394 offset_t offset, struct flk_callback *flk_cbp, 395 cred_t *cr, caller_context_t *ctp) 396 { 397 struct dcnode *dp = VTODC(vp); 398 int error; 399 struct vattr vattr; 400 401 /* 402 * If file is being mapped, disallow frlock. 403 */ 404 vattr.va_mask = AT_MODE; 405 if (error = VOP_GETATTR(dp->dc_subvp, &vattr, 0, cr, ctp)) 406 return (error); 407 if (dp->dc_mapcnt > 0 && MANDLOCK(vp, vattr.va_mode)) 408 return (EAGAIN); 409 410 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp)); 411 } 412 413 /*ARGSUSED*/ 414 static int 415 dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp, 416 struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr) 417 { 418 struct dcnode *dp = VTODC(vp); 419 struct comphdr *hdr = dp->dc_hdr; 420 struct page *pp; 421 struct buf *bp; 422 caddr_t saddr; 423 off_t cblkno; 424 size_t rdoff, rdsize, dsize; 425 long xlen; 426 int error, zerr; 427 428 ASSERT(len == hdr->ch_blksize); 429 /* 430 * Get destination pages and make them addressable 431 */ 432 pp = page_create_va(vp, off, len, PG_WAIT, seg, addr); 433 bp = pageio_setup(pp, len, vp, B_READ); 434 bp_mapin(bp); 435 436 /* 437 * read compressed data from subordinate vnode 438 */ 439 saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP); 440 cblkno = off / len; 441 rdoff = hdr->ch_blkmap[cblkno]; 442 rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff; 443 error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff, 444 UIO_SYSSPACE, 0, 0, cr, NULL); 445 if (error) 446 goto cleanup; 447 448 /* 449 * Uncompress 450 */ 451 dsize = len; 452 zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax); 453 if (zerr != Z_OK) { 454 error = EIO; 455 goto cleanup; 456 } 457 458 /* 459 * Handle EOF 460 */ 461 xlen = hdr->ch_fsize - off; 462 if (xlen < len) { 463 bzero(bp->b_un.b_addr + xlen, len - xlen); 464 if (dsize != xlen) 465 error = EIO; 466 } else if (dsize != len) 467 error = EIO; 468 469 /* 470 * Clean up 471 */ 472 cleanup: 473 kmem_cache_free(dp->dc_bufcache, saddr); 474 pageio_done(bp); 475 *ppp = pp; 476 return (error); 477 } 478 479 static int 480 dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp, 481 struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr) 482 { 483 struct page *pp, *plist = NULL; 484 offset_t pgoff; 485 int rdblk; 486 487 /* 488 * pvn_read_kluster() doesn't quite do what we want, since it 489 * thinks sub block reads are ok. Here we always decompress 490 * a full block. 491 */ 492 493 /* 494 * Check page cache 495 */ 496 rdblk = 0; 497 for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) { 498 pp = page_lookup(vp, pgoff, SE_EXCL); 499 if (pp == NULL) { 500 rdblk = 1; 501 break; 502 } 503 page_io_lock(pp); 504 page_add(&plist, pp); 505 plist = plist->p_next; 506 } 507 if (!rdblk) { 508 *ppp = plist; 509 return (0); /* all pages in cache */ 510 } 511 512 /* 513 * Undo any locks so getblock_miss has an open field 514 */ 515 if (plist != NULL) 516 pvn_io_done(plist); 517 518 return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr)); 519 } 520 521 static int 522 dc_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) 523 { 524 struct vnode *rvp; 525 526 vp = VTODC(vp)->dc_subvp; 527 if (VOP_REALVP(vp, &rvp, ct) == 0) 528 vp = rvp; 529 *vpp = vp; 530 return (0); 531 } 532 533 /*ARGSUSED10*/ 534 static int 535 dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp, 536 struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr, 537 enum seg_rw rw, struct cred *cr, caller_context_t *ctp) 538 { 539 struct dcnode *dp = VTODC(vp); 540 struct comphdr *hdr = dp->dc_hdr; 541 struct page *pp, *plist = NULL; 542 caddr_t vp_baddr; 543 offset_t vp_boff, vp_bend; 544 size_t bsize = hdr->ch_blksize; 545 int nblks, error; 546 547 /* does not support write */ 548 if (rw == S_WRITE) { 549 panic("write attempt on compressed file"); 550 /*NOTREACHED*/ 551 } 552 553 if (protp) 554 *protp = PROT_ALL; 555 /* 556 * We don't support asynchronous operation at the moment, so 557 * just pretend we did it. If the pages are ever actually 558 * needed, they'll get brought in then. 559 */ 560 if (pl == NULL) 561 return (0); 562 563 /* 564 * Calc block start and end offsets 565 */ 566 vp_boff = rounddown(off, bsize); 567 vp_bend = roundup(off + len, bsize); 568 vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize); 569 570 nblks = (vp_bend - vp_boff) / bsize; 571 while (nblks--) { 572 error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr, 573 rw, cr); 574 page_list_concat(&plist, &pp); 575 vp_boff += bsize; 576 vp_baddr += bsize; 577 } 578 if (!error) 579 pvn_plist_init(plist, pl, plsz, off, len, rw); 580 else 581 pvn_read_done(plist, B_ERROR); 582 return (error); 583 } 584 585 /* 586 * This function should never be called. We need to have it to pass 587 * it as an argument to other functions. 588 */ 589 /*ARGSUSED*/ 590 static int 591 dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp, 592 int flags, struct cred *cr) 593 { 594 /* should never happen */ 595 cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page"); 596 /*NOTREACHED*/ 597 return (0); 598 } 599 600 601 /* 602 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED. 603 * B_INVAL is set by: 604 * 605 * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag. 606 * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice 607 * which translates to an MC_SYNC with the MS_INVALIDATE flag. 608 * 609 * The B_FREE (as well as the B_DONTNEED) flag is set when the 610 * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked 611 * from SEGVN to release pages behind a pagefault. 612 */ 613 /*ARGSUSED5*/ 614 static int 615 dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags, 616 struct cred *cr, caller_context_t *ctp) 617 { 618 int error = 0; 619 620 if (vp->v_count == 0) { 621 panic("dcfs_putpage: bad v_count"); 622 /*NOTREACHED*/ 623 } 624 625 if (vp->v_flag & VNOMAP) 626 return (ENOSYS); 627 628 if (!vn_has_cached_data(vp)) /* no pages mapped */ 629 return (0); 630 631 if (len == 0) /* from 'off' to EOF */ 632 error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr); 633 else { 634 offset_t io_off; 635 se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED; 636 637 for (io_off = off; io_off < off + len; io_off += PAGESIZE) { 638 page_t *pp; 639 640 /* 641 * We insist on getting the page only if we are 642 * about to invalidate, free or write it and 643 * the B_ASYNC flag is not set. 644 */ 645 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) 646 pp = page_lookup(vp, io_off, se); 647 else 648 pp = page_lookup_nowait(vp, io_off, se); 649 650 if (pp == NULL) 651 continue; 652 /* 653 * Normally pvn_getdirty() should return 0, which 654 * impies that it has done the job for us. 655 * The shouldn't-happen scenario is when it returns 1. 656 * This means that the page has been modified and 657 * needs to be put back. 658 * Since we can't write to a dcfs compressed file, 659 * we fake a failed I/O and force pvn_write_done() 660 * to destroy the page. 661 */ 662 if (pvn_getdirty(pp, flags) == 1) { 663 cmn_err(CE_NOTE, "dc_putpage: dirty page"); 664 pvn_write_done(pp, flags | 665 B_ERROR | B_WRITE | B_INVAL | B_FORCE); 666 } 667 } 668 } 669 return (error); 670 } 671 672 static int 673 dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp, 674 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, 675 struct cred *cred, caller_context_t *ctp) 676 { 677 struct vattr vattr; 678 struct segvn_crargs vn_a; 679 int error; 680 681 if (vp->v_flag & VNOMAP) 682 return (ENOSYS); 683 684 if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0) 685 return (ENXIO); 686 687 /* 688 * If file is being locked, disallow mapping. 689 */ 690 if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp)) 691 return (error); 692 if (vn_has_mandatory_locks(vp, vattr.va_mode)) 693 return (EAGAIN); 694 695 as_rangelock(as); 696 697 if ((flags & MAP_FIXED) == 0) { 698 map_addr(addrp, len, off, 1, flags); 699 if (*addrp == NULL) { 700 as_rangeunlock(as); 701 return (ENOMEM); 702 } 703 } else { 704 /* 705 * User specified address - blow away any previous mappings 706 */ 707 (void) as_unmap(as, *addrp, len); 708 } 709 710 vn_a.vp = vp; 711 vn_a.offset = off; 712 vn_a.type = flags & MAP_TYPE; 713 vn_a.prot = prot; 714 vn_a.maxprot = maxprot; 715 vn_a.flags = flags & ~MAP_TYPE; 716 vn_a.cred = cred; 717 vn_a.amp = NULL; 718 vn_a.szc = 0; 719 vn_a.lgrp_mem_policy_flags = 0; 720 721 error = as_map(as, *addrp, len, segvn_create, &vn_a); 722 as_rangeunlock(as); 723 return (error); 724 } 725 726 /*ARGSUSED*/ 727 static int 728 dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr, 729 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, 730 struct cred *cr, caller_context_t *ctp) 731 { 732 struct dcnode *dp; 733 734 if (vp->v_flag & VNOMAP) 735 return (ENOSYS); 736 737 dp = VTODC(vp); 738 mutex_enter(&dp->dc_lock); 739 dp->dc_mapcnt += btopr(len); 740 mutex_exit(&dp->dc_lock); 741 return (0); 742 } 743 744 /*ARGSUSED*/ 745 static int 746 dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr, 747 size_t len, uint_t prot, uint_t maxprot, uint_t flags, 748 struct cred *cr, caller_context_t *ctp) 749 { 750 struct dcnode *dp; 751 752 if (vp->v_flag & VNOMAP) 753 return (ENOSYS); 754 755 dp = VTODC(vp); 756 mutex_enter(&dp->dc_lock); 757 dp->dc_mapcnt -= btopr(len); 758 ASSERT(dp->dc_mapcnt >= 0); 759 mutex_exit(&dp->dc_lock); 760 return (0); 761 } 762 763 /* 764 * Constructor/destructor routines for dcnodes 765 */ 766 /*ARGSUSED1*/ 767 static int 768 dcnode_constructor(void *buf, void *cdrarg, int kmflags) 769 { 770 struct dcnode *dp = buf; 771 struct vnode *vp; 772 773 vp = dp->dc_vp = vn_alloc(kmflags); 774 if (vp == NULL) { 775 return (-1); 776 } 777 vp->v_data = dp; 778 vp->v_type = VREG; 779 vp->v_flag = VNOSWAP; 780 vp->v_vfsp = &dc_vfs; 781 vn_setops(vp, dc_vnodeops); 782 vn_exists(vp); 783 784 mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL); 785 dp->dc_mapcnt = 0; 786 dp->dc_lrunext = dp->dc_lruprev = NULL; 787 dp->dc_hdr = NULL; 788 dp->dc_subvp = NULL; 789 return (0); 790 } 791 792 /*ARGSUSED*/ 793 static void 794 dcnode_destructor(void *buf, void *cdrarg) 795 { 796 struct dcnode *dp = buf; 797 struct vnode *vp = DCTOV(dp); 798 799 mutex_destroy(&dp->dc_lock); 800 801 VERIFY(dp->dc_hdr == NULL); 802 VERIFY(dp->dc_subvp == NULL); 803 vn_invalid(vp); 804 vn_free(vp); 805 } 806 807 static struct dcnode * 808 dcnode_alloc(void) 809 { 810 struct dcnode *dp; 811 812 /* 813 * If the free list is above DCLRUSIZE 814 * re-use one from it 815 */ 816 mutex_enter(&dctable_lock); 817 if (dclru_len < DCLRUSIZE) { 818 mutex_exit(&dctable_lock); 819 dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP); 820 } else { 821 ASSERT(dclru != NULL); 822 dp = dclru; 823 dclru_sub(dp); 824 dcdelete(dp); 825 mutex_exit(&dctable_lock); 826 dcnode_recycle(dp); 827 } 828 return (dp); 829 } 830 831 static void 832 dcnode_free(struct dcnode *dp) 833 { 834 struct vnode *vp = DCTOV(dp); 835 836 ASSERT(MUTEX_HELD(&dctable_lock)); 837 838 /* 839 * If no cached pages, no need to put it on lru 840 */ 841 if (!vn_has_cached_data(vp)) { 842 dcdelete(dp); 843 dcnode_recycle(dp); 844 kmem_cache_free(dcnode_cache, dp); 845 return; 846 } 847 848 /* 849 * Add to lru, if it's over the limit, free from head 850 */ 851 dclru_add(dp); 852 if (dclru_len > DCLRUSIZE) { 853 dp = dclru; 854 dclru_sub(dp); 855 dcdelete(dp); 856 dcnode_recycle(dp); 857 kmem_cache_free(dcnode_cache, dp); 858 } 859 } 860 861 static void 862 dcnode_recycle(struct dcnode *dp) 863 { 864 struct vnode *vp; 865 866 vp = DCTOV(dp); 867 868 VN_RELE(dp->dc_subvp); 869 dp->dc_subvp = NULL; 870 (void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL); 871 kmem_free(dp->dc_hdr, dp->dc_hdrsize); 872 dp->dc_hdr = NULL; 873 dp->dc_hdrsize = dp->dc_zmax = 0; 874 dp->dc_bufcache = NULL; 875 dp->dc_mapcnt = 0; 876 vn_reinit(vp); 877 vp->v_type = VREG; 878 vp->v_flag = VNOSWAP; 879 vp->v_vfsp = &dc_vfs; 880 } 881 882 static int 883 dcinit(int fstype, char *name) 884 { 885 static const fs_operation_def_t dc_vfsops_template[] = { 886 NULL, NULL 887 }; 888 int error; 889 major_t dev; 890 891 error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops); 892 if (error) { 893 cmn_err(CE_WARN, "dcinit: bad vfs ops template"); 894 return (error); 895 } 896 VFS_INIT(&dc_vfs, dc_vfsops, NULL); 897 dc_vfs.vfs_flag = VFS_RDONLY; 898 dc_vfs.vfs_fstype = fstype; 899 if ((dev = getudev()) == (major_t)-1) 900 dev = 0; 901 dcdev = makedevice(dev, 0); 902 dc_vfs.vfs_dev = dcdev; 903 904 error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops); 905 if (error != 0) { 906 (void) vfs_freevfsops_by_type(fstype); 907 cmn_err(CE_WARN, "dcinit: bad vnode ops template"); 908 return (error); 909 } 910 911 mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL); 912 mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL); 913 dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode), 914 0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0); 915 916 return (0); 917 } 918 919 /* 920 * Return shadow vnode with the given vp as its subordinate 921 */ 922 struct vnode * 923 decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp) 924 { 925 struct dcnode *dp, *ndp; 926 struct comphdr thdr, *hdr; 927 struct kmem_cache **cpp; 928 struct vattr vattr; 929 size_t hdrsize, bsize; 930 int error; 931 932 /* 933 * See if we have an existing shadow 934 * If none, we have to manufacture one 935 */ 936 mutex_enter(&dctable_lock); 937 dp = dcfind(vp); 938 mutex_exit(&dctable_lock); 939 if (dp != NULL) 940 return (DCTOV(dp)); 941 942 /* 943 * Make sure it's a valid compressed file 944 */ 945 hdr = &thdr; 946 error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0, 947 UIO_SYSSPACE, 0, 0, cred, NULL); 948 if (error || hdr->ch_magic != CH_MAGIC_ZLIB || 949 hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB || 950 hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE || 951 hdr->ch_blksize > ptob(DCCACHESIZE) || 952 (hdr->ch_blksize & (hdr->ch_blksize - 1)) != 0) 953 return (NULL); 954 955 /* get underlying file size */ 956 if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0) 957 return (NULL); 958 959 /* 960 * Re-read entire header 961 */ 962 hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t); 963 hdr = kmem_alloc(hdrsize, KM_SLEEP); 964 error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE, 965 0, 0, cred, NULL); 966 if (error) { 967 kmem_free(hdr, hdrsize); 968 return (NULL); 969 } 970 971 /* 972 * add extra blkmap entry to make dc_getblock()'s 973 * life easier 974 */ 975 bsize = hdr->ch_blksize; 976 hdr->ch_blkmap[((hdr->ch_fsize-1) / bsize) + 1] = vattr.va_size; 977 978 ndp = dcnode_alloc(); 979 ndp->dc_subvp = vp; 980 VN_HOLD(vp); 981 ndp->dc_hdr = hdr; 982 ndp->dc_hdrsize = hdrsize; 983 984 /* 985 * Allocate kmem cache if none there already 986 */ 987 ndp->dc_zmax = ZMAXBUF(bsize); 988 cpp = &dcbuf_cache[btop(bsize)]; 989 mutex_enter(&dccache_lock); 990 if (*cpp == NULL) 991 *cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL, 992 NULL, NULL, NULL, NULL, 0); 993 mutex_exit(&dccache_lock); 994 ndp->dc_bufcache = *cpp; 995 996 /* 997 * Recheck table in case someone else created shadow 998 * while we were blocked above. 999 */ 1000 mutex_enter(&dctable_lock); 1001 dp = dcfind(vp); 1002 if (dp != NULL) { 1003 mutex_exit(&dctable_lock); 1004 dcnode_recycle(ndp); 1005 kmem_cache_free(dcnode_cache, ndp); 1006 return (DCTOV(dp)); 1007 } 1008 dcinsert(ndp); 1009 mutex_exit(&dctable_lock); 1010 1011 return (DCTOV(ndp)); 1012 } 1013 1014 1015 /* 1016 * dcnode lookup table 1017 * These routines maintain a table of dcnodes hashed by their 1018 * subordinate vnode so that they can be found if they already 1019 * exist in the vnode cache 1020 */ 1021 1022 /* 1023 * Put a dcnode in the table. 1024 */ 1025 static void 1026 dcinsert(struct dcnode *newdp) 1027 { 1028 int idx = DCHASH(newdp->dc_subvp); 1029 1030 ASSERT(MUTEX_HELD(&dctable_lock)); 1031 newdp->dc_hash = dctable[idx]; 1032 dctable[idx] = newdp; 1033 } 1034 1035 /* 1036 * Remove a dcnode from the hash table. 1037 */ 1038 void 1039 dcdelete(struct dcnode *deldp) 1040 { 1041 int idx = DCHASH(deldp->dc_subvp); 1042 struct dcnode *dp, *prevdp; 1043 1044 ASSERT(MUTEX_HELD(&dctable_lock)); 1045 dp = dctable[idx]; 1046 if (dp == deldp) 1047 dctable[idx] = dp->dc_hash; 1048 else { 1049 for (prevdp = dp, dp = dp->dc_hash; dp != NULL; 1050 prevdp = dp, dp = dp->dc_hash) { 1051 if (dp == deldp) { 1052 prevdp->dc_hash = dp->dc_hash; 1053 break; 1054 } 1055 } 1056 } 1057 ASSERT(dp != NULL); 1058 } 1059 1060 /* 1061 * Find a shadow vnode in the dctable hash list. 1062 */ 1063 static struct dcnode * 1064 dcfind(struct vnode *vp) 1065 { 1066 struct dcnode *dp; 1067 1068 ASSERT(MUTEX_HELD(&dctable_lock)); 1069 for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash) 1070 if (dp->dc_subvp == vp) { 1071 VN_HOLD(DCTOV(dp)); 1072 if (dp->dc_lrunext) 1073 dclru_sub(dp); 1074 return (dp); 1075 } 1076 return (NULL); 1077 } 1078 1079 #ifdef DEBUG 1080 static int 1081 dclru_count(void) 1082 { 1083 struct dcnode *dp; 1084 int i = 0; 1085 1086 if (dclru == NULL) 1087 return (0); 1088 for (dp = dclru; dp->dc_lrunext != dclru; dp = dp->dc_lrunext) 1089 i++; 1090 return (i + 1); 1091 } 1092 #endif 1093 1094 static void 1095 dclru_add(struct dcnode *dp) 1096 { 1097 /* 1098 * Add to dclru as double-link chain 1099 */ 1100 ASSERT(MUTEX_HELD(&dctable_lock)); 1101 if (dclru == NULL) { 1102 dclru = dp; 1103 dp->dc_lruprev = dp->dc_lrunext = dp; 1104 } else { 1105 struct dcnode *last = dclru->dc_lruprev; 1106 1107 dclru->dc_lruprev = dp; 1108 last->dc_lrunext = dp; 1109 dp->dc_lruprev = last; 1110 dp->dc_lrunext = dclru; 1111 } 1112 dclru_len++; 1113 ASSERT(dclru_len == dclru_count()); 1114 } 1115 1116 static void 1117 dclru_sub(struct dcnode *dp) 1118 { 1119 ASSERT(MUTEX_HELD(&dctable_lock)); 1120 dp->dc_lrunext->dc_lruprev = dp->dc_lruprev; 1121 dp->dc_lruprev->dc_lrunext = dp->dc_lrunext; 1122 if (dp == dclru) 1123 dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext; 1124 dp->dc_lrunext = dp->dc_lruprev = NULL; 1125 dclru_len--; 1126 ASSERT(dclru_len == dclru_count()); 1127 } 1128