/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/buf.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/vmsystm.h>
#include <sys/open.h>
#include <sys/swap.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/zmod.h>
#include <sys/fs/decomp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>

#include <fs/fs_subr.h>

/*
 * dcfs - A filesystem for automatic decompression of fiocompressed files
 *
 * This filesystem is a layered filesystem that sits on top of a normal
 * persistent filesystem and provides automatic decompression of files
 * that have been previously compressed and stored on the host filesystem.
 * This is a pseudo filesystem in that it does not persist data; rather, it
 * intercepts file lookup requests on the host filesystem and provides
 * transparent decompression of those files. Currently the only supported
 * host filesystem is ufs.
 *
 * A file is compressed via a userland utility (currently cmd/boot/fiocompress)
 * and marked by fiocompress as a compressed file via a flag in the on-disk
 * inode (set via the _FIO_COMPRESSED ioctl() - see ufs_ioctl() in
 * ufs_vnops.c). ufs_lookup() checks for this flag and, if it is set, passes
 * control to decompvp(), a function defined in this (dcfs) filesystem.
 * decompvp() uncompresses the file and returns a dcfs vnode to the VFS layer.
 *
 * dcfs is layered on top of ufs and passes requests involving persistence
 * to the underlying ufs filesystem.
 * The compressed files currently cannot be written to.
 */
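/*
 * For orientation, a rough sketch of the on-disk layout this code expects
 * of a fiocompressed file, inferred from how the comphdr fields are used
 * below (sys/fs/decomp.h is the authoritative definition):
 *
 *	+------------------------+
 *	| struct comphdr         | ch_magic, ch_version, ch_algorithm,
 *	|                        | ch_fsize (uncompressed size),
 *	|                        | ch_blksize (logical block size)
 *	+------------------------+
 *	| ch_blkmap[]            | file offset of each compressed block;
 *	|                        | ch_blkmap[0] also gives the size of
 *	|                        | this header region
 *	+------------------------+
 *	| compressed blocks      | one zlib stream per ch_blksize-sized
 *	|                        | logical block of the original file
 *	+------------------------+
 *
 * decompvp() validates and caches the header; dc_getblock_miss() uses
 * ch_blkmap[] to locate, read and inflate one block at a time.
 */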
/*
 * Define data structures within this file.
 */
#define	DCSHFT		5
#define	DCTABLESIZE	16

#if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
#define	DCHASH(vp)	(((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
#else
#define	DCHASH(vp)	(((uintptr_t)(vp) >> DCSHFT) % DCTABLESIZE)
#endif

#define	DCLRUSIZE	16

#define	DCCACHESIZE	4

#define	rounddown(x, y)	((x) & ~((y) - 1))

struct dcnode	*dctable[DCTABLESIZE];

struct dcnode	*dclru;
static int	dclru_len;

kmutex_t	dctable_lock;

dev_t		dcdev;
struct vfs	dc_vfs;

struct kmem_cache *dcnode_cache;
struct kmem_cache *dcbuf_cache[DCCACHESIZE];

kmutex_t	dccache_lock;

static int dcinit(int, char *);

static struct dcnode	*dcnode_alloc(void);
static void		dcnode_free(struct dcnode *);
static void		dcnode_recycle(struct dcnode *);

static void		dcinsert(struct dcnode *);
static void		dcdelete(struct dcnode *);
static struct dcnode	*dcfind(struct vnode *);
static void		dclru_add(struct dcnode *);
static void		dclru_sub(struct dcnode *);


/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>

struct vfsops *dc_vfsops;

static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"dcfs",
	dcinit,
	0,
	NULL
};

/*
 * Module linkage information for the kernel.
 */
extern struct mod_ops mod_fsops;

static struct modlfs modlfs = {
	&mod_fsops, "compressed filesystem", &vfw
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};

int
_init()
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
static int dc_close(struct vnode *, int, int, offset_t,
    struct cred *, caller_context_t *);
static int dc_read(struct vnode *, struct uio *, int, struct cred *,
    struct caller_context *);
static int dc_getattr(struct vnode *, struct vattr *, int,
    struct cred *, caller_context_t *);
static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
    struct caller_context *);
static int dc_access(struct vnode *, int, int,
    struct cred *, caller_context_t *);
static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    struct flk_callback *, struct cred *, caller_context_t *);
static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
    struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
    struct cred *, caller_context_t *);
static int dc_putpage(struct vnode *, offset_t, size_t, int,
    struct cred *, caller_context_t *);
static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
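/*
 * Locking notes (a summary inferred from the code below, not an
 * authoritative statement of the original design):
 *
 * dctable_lock	protects the dctable[] hash chains, the dclru list and
 *		dclru_len; it is acquired before v_lock in dc_inactive().
 * dccache_lock	protects the lazy creation of the dcbuf_cache[] kmem
 *		caches in decompvp().
 * dc_lock	(per-dcnode) protects dc_mapcnt.
 */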
struct vnodeops *dc_vnodeops;

const fs_operation_def_t dc_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = dc_open },
	VOPNAME_CLOSE,		{ .vop_close = dc_close },
	VOPNAME_READ,		{ .vop_read = dc_read },
	VOPNAME_GETATTR,	{ .vop_getattr = dc_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = dc_setattr },
	VOPNAME_ACCESS,		{ .vop_access = dc_access },
	VOPNAME_FSYNC,		{ .vop_fsync = dc_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = dc_inactive },
	VOPNAME_FID,		{ .vop_fid = dc_fid },
	VOPNAME_SEEK,		{ .vop_seek = dc_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = dc_frlock },
	VOPNAME_GETPAGE,	{ .vop_getpage = dc_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = dc_putpage },
	VOPNAME_MAP,		{ .vop_map = dc_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = dc_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = dc_delmap },
	NULL,			NULL
};

/*ARGSUSED*/
static int
dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static int
dc_close(struct vnode *vp, int flag, int count, offset_t off,
    struct cred *cr, caller_context_t *ctp)
{
	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
	return (0);
}

/*ARGSUSED*/
static int
dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
    struct caller_context *ct)
{
	struct dcnode *dp = VTODC(vp);
	size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
	size_t fsize = dp->dc_hdr->ch_fsize;
	int error;

	/*
	 * Loop through file with segmap; decompression will occur
	 * in dc_getpage()
	 */
	do {
		caddr_t base;
		size_t n;
		offset_t mapon;

		/*
		 * read to end of block or file
		 */
		mapon = uiop->uio_loffset & (rdsize - 1);
		n = MIN(rdsize - mapon, uiop->uio_resid);
		n = MIN(n, fsize - uiop->uio_loffset);
		if (n == 0)
			return (0);	/* at EOF */

		base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
		    S_READ);
		error = uiomove(base + mapon, n, UIO_READ, uiop);
		if (!error) {
			uint_t flags;

			if (n + mapon == rdsize || uiop->uio_loffset == fsize)
				flags = SM_DONTNEED;
			else
				flags = 0;
			error = segmap_release(segkmap, base, flags);
		} else
			(void) segmap_release(segkmap, base, 0);
	} while (!error && uiop->uio_resid);

	return (error);
}

static int
dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;
	int error;

	error = VOP_GETATTR(subvp, vap, flags, cred, ctp);

	/* substitute uncompressed size */
	vap->va_size = dp->dc_hdr->ch_fsize;
	return (error);
}

static int
dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
    caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
}

static int
dc_access(struct vnode *vp, int mode, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
}
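/*
 * Like dc_getattr(), dc_setattr() and dc_access() above, most of the
 * remaining attribute-style operations (dc_fid(), dc_seek()) simply
 * delegate to the subordinate (host filesystem) vnode dc_subvp;
 * dc_getattr() is the only one that edits the result, substituting the
 * uncompressed file size. Paging and mapping operations, by contrast,
 * are handled by dcfs itself, since they operate on decompressed data.
 */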
/*ARGSUSED*/
static int
dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static void
dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);

	mutex_enter(&dctable_lock);
	mutex_enter(&vp->v_lock);
	ASSERT(vp->v_count >= 1);
	if (--vp->v_count != 0) {
		/*
		 * Somebody accessed the dcnode before we got a chance to
		 * remove it. They will remove it when they do a vn_rele.
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&dctable_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	dcnode_free(dp);

	mutex_exit(&dctable_lock);
}

static int
dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_FID(subvp, fidp, ctp));
}

static int
dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SEEK(subvp, oof, noffp, ctp));
}

static int
dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp,
    cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);

	/*
	 * If file is being mapped, disallow frlock.
	 */
	if (dp->dc_mapcnt > 0)
		return (EAGAIN);

	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
}

/*ARGSUSED*/
static int
dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp;
	struct buf *bp;
	caddr_t saddr;
	off_t cblkno;
	size_t rdoff, rdsize, dsize;
	long xlen;
	int error, zerr;

	ASSERT(len == hdr->ch_blksize);
	/*
	 * Get destination pages and make them addressable
	 */
	pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
	bp = pageio_setup(pp, len, vp, B_READ);
	bp_mapin(bp);

	/*
	 * read compressed data from subordinate vnode
	 */
	saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
	cblkno = off / len;
	rdoff = hdr->ch_blkmap[cblkno];
	rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
	error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
	    UIO_SYSSPACE, 0, 0, cr, NULL);
	if (error)
		goto cleanup;

	/*
	 * Uncompress
	 */
	dsize = len;
	zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
	if (zerr != Z_OK) {
		error = EIO;
		goto cleanup;
	}

	/*
	 * Handle EOF
	 */
	xlen = hdr->ch_fsize - off;
	if (xlen < len) {
		bzero(bp->b_un.b_addr + xlen, len - xlen);
		if (dsize != xlen)
			error = EIO;
	} else if (dsize != len)
		error = EIO;

	/*
	 * Clean up
	 */
cleanup:
	kmem_cache_free(dp->dc_bufcache, saddr);
	pageio_done(bp);
	*ppp = pp;
	return (error);
}
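/*
 * A hypothetical example of the block map arithmetic above (values
 * assumed for illustration): with ch_blksize of 8K and
 * ch_blkmap[] = { 4096, 6240, 9800, ... }, a miss at off 8192 gives
 * cblkno 1, so the compressed bytes live at offsets [6240, 9800) of
 * the subordinate vnode, and they inflate back into one full 8K
 * logical block (less at EOF, where the tail is zeroed above).
 */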
static int
dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct page *pp, *plist = NULL;
	offset_t pgoff;
	int rdblk;

	/*
	 * pvn_read_kluster() doesn't quite do what we want, since it
	 * thinks sub block reads are ok. Here we always decompress
	 * a full block.
	 */

	/*
	 * Check page cache
	 */
	rdblk = 0;
	for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
		pp = page_lookup(vp, pgoff, SE_EXCL);
		if (pp == NULL) {
			rdblk = 1;
			break;
		}
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
	}
	if (!rdblk) {
		*ppp = plist;
		return (0);	/* all pages in cache */
	}

	/*
	 * Undo any locks so getblock_miss has an open field
	 */
	if (plist != NULL)
		pvn_io_done(plist);

	return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
}

/*ARGSUSED10*/
static int
dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp, *plist = NULL;
	caddr_t vp_baddr;
	offset_t vp_boff, vp_bend;
	size_t bsize = hdr->ch_blksize;
	int nblks, error = 0;

	/* does not support write */
	if (rw == S_WRITE) {
		panic("write attempt on compressed file");
		/*NOTREACHED*/
	}

	if (protp)
		*protp = PROT_ALL;
	/*
	 * We don't support asynchronous operation at the moment, so
	 * just pretend we did it. If the pages are ever actually
	 * needed, they'll get brought in then.
	 */
	if (pl == NULL)
		return (0);

	/*
	 * Calc block start and end offsets
	 */
	vp_boff = rounddown(off, bsize);
	vp_bend = roundup(off + len, bsize);
	vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);

	nblks = (vp_bend - vp_boff) / bsize;
	while (nblks--) {
		error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
		    rw, cr);
		page_list_concat(&plist, &pp);
		if (error)	/* don't let a later block mask the failure */
			break;
		vp_boff += bsize;
		vp_baddr += bsize;
	}
	if (!error)
		pvn_plist_init(plist, pl, plsz, off, len, rw);
	else
		pvn_read_done(plist, B_ERROR);
	return (error);
}
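/*
 * For illustration (values assumed): with bsize of 8K and PAGESIZE of
 * 4K, a fault at off 13000 for len 4096 is expanded above to
 * vp_boff 8192 and vp_bend 24576, i.e. nblks = 2 whole blocks, because
 * decompression only works on full ch_blksize units. Pages beyond the
 * caller's request simply land in the page cache via pvn_plist_init().
 */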
/*
 * This function should never be called. We need to have it to pass
 * it as an argument to other functions.
 */
/*ARGSUSED*/
static int
dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
    int flags, struct cred *cr)
{
	/* should never happen */
	cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
	/*NOTREACHED*/
	return (0);
}


/*
 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
 * B_INVAL is set by:
 *
 * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
 * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
 *    which translates to an MC_SYNC with the MS_INVALIDATE flag.
 *
 * The B_FREE (as well as the B_DONTNEED) flag is set when the
 * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
 * from SEGVN to release pages behind a pagefault.
 */
/*ARGSUSED5*/
static int
dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
    struct cred *cr, caller_context_t *ctp)
{
	int error = 0;

	if (vp->v_count == 0) {
		panic("dcfs_putpage: bad v_count");
		/*NOTREACHED*/
	}

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (!vn_has_cached_data(vp))	/* no pages mapped */
		return (0);

	if (len == 0)			/* from 'off' to EOF */
		error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
	else {
		offset_t io_off;
		se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
			page_t *pp;

			/*
			 * We insist on getting the page only if we are
			 * about to invalidate, free or write it and
			 * the B_ASYNC flag is not set.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
				pp = page_lookup(vp, io_off, se);
			else
				pp = page_lookup_nowait(vp, io_off, se);

			if (pp == NULL)
				continue;
			/*
			 * Normally pvn_getdirty() should return 0, which
			 * implies that it has done the job for us.
			 * The shouldn't-happen scenario is when it returns 1.
			 * This means that the page has been modified and
			 * needs to be put back.
			 * Since we can't write to a dcfs compressed file,
			 * we fake a failed I/O and force pvn_write_done()
			 * to destroy the page.
			 */
			if (pvn_getdirty(pp, flags) == 1) {
				cmn_err(CE_NOTE, "dc_putpage: dirty page");
				pvn_write_done(pp, flags |
				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
			}
		}
	}
	return (error);
}

static int
dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ctp)
{
	struct vattr vattr;
	struct segvn_crargs vn_a;
	int error;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
		return (ENXIO);

	/*
	 * If file is being locked, disallow mapping.
	 */
	if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
		return (error);
	if (vn_has_mandatory_locks(vp, vattr.va_mode))
		return (EAGAIN);

	as_rangelock(as);

	if ((flags & MAP_FIXED) == 0) {
		map_addr(addrp, len, off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
	} else {
		/*
		 * User specified address - blow away any previous mappings
		 */
		(void) as_unmap(as, *addrp, len);
	}

	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.cred = cred;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);
	return (error);
}

/*ARGSUSED*/
static int
dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt += btopr(len);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*ARGSUSED*/
static int
dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt -= btopr(len);
	ASSERT(dp->dc_mapcnt >= 0);
	mutex_exit(&dp->dc_lock);
	return (0);
}
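/*
 * A note on the constructor/destructor pair below: under the kmem
 * object-cache model, the constructor runs only when a slab object is
 * first created, not on every kmem_cache_alloc(), and an object keeps
 * its constructed state (here, an attached vnode) across free/alloc
 * cycles. dcnode_recycle() is what returns a dcnode to a clean state
 * between uses.
 */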
/*
 * Constructor/destructor routines for dcnodes
 */
/*ARGSUSED1*/
static int
dcnode_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct dcnode *dp = buf;
	struct vnode *vp;

	vp = dp->dc_vp = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vp->v_data = dp;
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
	vn_setops(vp, dc_vnodeops);
	vn_exists(vp);

	mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
	dp->dc_mapcnt = 0;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dp->dc_hdr = NULL;
	dp->dc_subvp = NULL;
	return (0);
}

/*ARGSUSED*/
static void
dcnode_destructor(void *buf, void *cdrarg)
{
	struct dcnode *dp = buf;
	struct vnode *vp = DCTOV(dp);

	mutex_destroy(&dp->dc_lock);

	VERIFY(dp->dc_hdr == NULL);
	VERIFY(dp->dc_subvp == NULL);
	vn_invalid(vp);
	vn_free(vp);
}

static struct dcnode *
dcnode_alloc(void)
{
	struct dcnode *dp;

	/*
	 * If the LRU list has reached DCLRUSIZE entries,
	 * re-use one from it rather than allocating anew
	 */
	mutex_enter(&dctable_lock);
	if (dclru_len < DCLRUSIZE) {
		mutex_exit(&dctable_lock);
		dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
	} else {
		ASSERT(dclru != NULL);
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		mutex_exit(&dctable_lock);
		dcnode_recycle(dp);
	}
	return (dp);
}

static void
dcnode_free(struct dcnode *dp)
{
	struct vnode *vp = DCTOV(dp);

	ASSERT(MUTEX_HELD(&dctable_lock));

	/*
	 * If no cached pages, no need to put it on lru
	 */
	if (!vn_has_cached_data(vp)) {
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
		return;
	}

	/*
	 * Add to lru, if it's over the limit, free from head
	 */
	dclru_add(dp);
	if (dclru_len > DCLRUSIZE) {
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
	}
}

static void
dcnode_recycle(struct dcnode *dp)
{
	struct vnode *vp;

	vp = DCTOV(dp);

	VN_RELE(dp->dc_subvp);
	dp->dc_subvp = NULL;
	(void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
	kmem_free(dp->dc_hdr, dp->dc_hdrsize);
	dp->dc_hdr = NULL;
	dp->dc_hdrsize = dp->dc_zmax = 0;
	dp->dc_bufcache = NULL;
	dp->dc_mapcnt = 0;
	vn_reinit(vp);
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
}

static int
dcinit(int fstype, char *name)
{
	static const fs_operation_def_t dc_vfsops_template[] = {
		NULL, NULL
	};
	int error;
	major_t dev;

	error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
	if (error) {
		cmn_err(CE_WARN, "dcinit: bad vfs ops template");
		return (error);
	}
	VFS_INIT(&dc_vfs, dc_vfsops, NULL);
	dc_vfs.vfs_flag = VFS_RDONLY;
	dc_vfs.vfs_fstype = fstype;
	if ((dev = getudev()) == (major_t)-1)
		dev = 0;
	dcdev = makedevice(dev, 0);
	dc_vfs.vfs_dev = dcdev;

	error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
	if (error != 0) {
		(void) vfs_freevfsops_by_type(fstype);
		cmn_err(CE_WARN, "dcinit: bad vnode ops template");
		return (error);
	}

	mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
	dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
	    0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);

	return (0);
}
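/*
 * A minimal sketch of how a host filesystem is expected to hand off to
 * decompvp(), based on the description at the top of this file (the
 * actual logic lives in ufs_lookup() in ufs_vnops.c and may differ;
 * the flag check shown here is hypothetical):
 *
 *	if (inode_is_fiocompressed(ip)) {	// hypothetical check
 *		cvp = decompvp(vp, cr, ct);
 *		if (cvp != NULL) {
 *			VN_RELE(vp);	// shadow took its own hold
 *			vp = cvp;
 *		}
 *	}
 *
 * decompvp() returns NULL if the file does not carry a valid
 * compressed-file header, in which case the caller should fall back
 * to the ordinary vnode.
 */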
/*
 * Return shadow vnode with the given vp as its subordinate
 */
struct vnode *
decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp, *ndp;
	struct comphdr thdr, *hdr;
	struct kmem_cache **cpp;
	struct vattr vattr;
	size_t hdrsize, bsize;
	int error;

	/*
	 * See if we have an existing shadow
	 * If none, we have to manufacture one
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	mutex_exit(&dctable_lock);
	if (dp != NULL)
		return (DCTOV(dp));

	/*
	 * Make sure it's a valid compressed file
	 */
	hdr = &thdr;
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
	    UIO_SYSSPACE, 0, 0, cred, NULL);
	if (error || hdr->ch_magic != CH_MAGIC_ZLIB ||
	    hdr->ch_version != CH_VERSION ||
	    hdr->ch_algorithm != CH_ALG_ZLIB ||
	    hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
	    hdr->ch_blksize > ptob(DCCACHESIZE) ||
	    (hdr->ch_blksize & (hdr->ch_blksize - 1)) != 0)
		return (NULL);

	/* get underlying file size */
	if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
		return (NULL);

	/*
	 * Re-read entire header
	 */
	hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
	hdr = kmem_alloc(hdrsize, KM_SLEEP);
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
	    0, 0, cred, NULL);
	if (error) {
		kmem_free(hdr, hdrsize);
		return (NULL);
	}

	/*
	 * add extra blkmap entry to make dc_getblock()'s
	 * life easier
	 */
	bsize = hdr->ch_blksize;
	hdr->ch_blkmap[((hdr->ch_fsize - 1) / bsize) + 1] = vattr.va_size;

	ndp = dcnode_alloc();
	ndp->dc_subvp = vp;
	VN_HOLD(vp);
	ndp->dc_hdr = hdr;
	ndp->dc_hdrsize = hdrsize;

	/*
	 * Allocate kmem cache if none there already
	 */
	ndp->dc_zmax = ZMAXBUF(bsize);
	cpp = &dcbuf_cache[btop(bsize)];
	mutex_enter(&dccache_lock);
	if (*cpp == NULL)
		*cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
		    NULL, NULL, NULL, NULL, 0);
	mutex_exit(&dccache_lock);
	ndp->dc_bufcache = *cpp;

	/*
	 * Recheck table in case someone else created shadow
	 * while we were blocked above.
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	if (dp != NULL) {
		mutex_exit(&dctable_lock);
		dcnode_recycle(ndp);
		kmem_cache_free(dcnode_cache, ndp);
		return (DCTOV(dp));
	}
	dcinsert(ndp);
	mutex_exit(&dctable_lock);

	return (DCTOV(ndp));
}


/*
 * dcnode lookup table
 * These routines maintain a table of dcnodes hashed by their
 * subordinate vnode so that they can be found if they already
 * exist in the vnode cache
 */

/*
 * Put a dcnode in the table.
 */
static void
dcinsert(struct dcnode *newdp)
{
	int idx = DCHASH(newdp->dc_subvp);

	ASSERT(MUTEX_HELD(&dctable_lock));
	newdp->dc_hash = dctable[idx];
	dctable[idx] = newdp;
}

/*
 * Remove a dcnode from the hash table.
 */
static void
dcdelete(struct dcnode *deldp)
{
	int idx = DCHASH(deldp->dc_subvp);
	struct dcnode *dp, *prevdp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	dp = dctable[idx];
	if (dp == deldp)
		dctable[idx] = dp->dc_hash;
	else {
		for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
		    prevdp = dp, dp = dp->dc_hash) {
			if (dp == deldp) {
				prevdp->dc_hash = dp->dc_hash;
				break;
			}
		}
	}
	ASSERT(dp != NULL);
}
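/*
 * Why keep an LRU list at all: a dcnode whose vnode still has cached
 * (already decompressed) pages is parked on dclru rather than destroyed
 * (see dcnode_free()), so a prompt re-lookup can revive it via dcfind()
 * below and reuse those pages without re-reading and re-inflating the
 * data. At most DCLRUSIZE such nodes are retained.
 */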
/*
 * Find a shadow vnode in the dctable hash list.
 */
static struct dcnode *
dcfind(struct vnode *vp)
{
	struct dcnode *dp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
		if (dp->dc_subvp == vp) {
			VN_HOLD(DCTOV(dp));
			if (dp->dc_lrunext)
				dclru_sub(dp);
			return (dp);
		}
	return (NULL);
}

#ifdef DEBUG
static int
dclru_count(void)
{
	struct dcnode *dp;
	int i = 0;

	if (dclru == NULL)
		return (0);
	for (dp = dclru; dp->dc_lrunext != dclru; dp = dp->dc_lrunext)
		i++;
	return (i + 1);
}
#endif

static void
dclru_add(struct dcnode *dp)
{
	/*
	 * Add to dclru as double-link chain
	 */
	ASSERT(MUTEX_HELD(&dctable_lock));
	if (dclru == NULL) {
		dclru = dp;
		dp->dc_lruprev = dp->dc_lrunext = dp;
	} else {
		struct dcnode *last = dclru->dc_lruprev;

		dclru->dc_lruprev = dp;
		last->dc_lrunext = dp;
		dp->dc_lruprev = last;
		dp->dc_lrunext = dclru;
	}
	dclru_len++;
	ASSERT(dclru_len == dclru_count());
}

static void
dclru_sub(struct dcnode *dp)
{
	ASSERT(MUTEX_HELD(&dctable_lock));
	dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
	dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
	if (dp == dclru)
		dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dclru_len--;
	ASSERT(dclru_len == dclru_count());
}