/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/buf.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/vmsystm.h>
#include <sys/open.h>
#include <sys/swap.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/zmod.h>
#include <sys/fs/decomp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>

#include <fs/fs_subr.h>

/*
 * dcfs - A filesystem for automatic decompression of fiocompressed files
 *
 * This filesystem is a layered filesystem that sits on top of a normal
 * persistent filesystem and provides automatic decompression of files
 * that have been previously compressed and stored on the host file system.
 * This is a pseudo filesystem in that it does not persist data; rather it
 * intercepts file lookup requests on the host filesystem and provides
 * transparent decompression of those files.  Currently the only supported
 * host filesystem is ufs.
 *
 * A file is compressed via a userland utility (currently cmd/boot/fiocompress)
 * and marked by fiocompress as a compressed file via a flag in the on-disk
 * inode (set via a ufs ioctl() - see the _FIO_COMPRESSED case of ufs_ioctl()
 * in ufs_vnops.c).  ufs_lookup checks for this flag and, if set, passes
 * control to decompvp, a function defined in this (dcfs) filesystem.
 * decompvp uncompresses the file and returns a dcfs vnode to the VFS layer.
 *
 * dcfs is layered on top of ufs and passes requests involving persistence
 * to the underlying ufs filesystem.  The compressed files currently cannot be
 * written to.
 */
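
/*
 * Illustrative sketch (not part of the original source): how a host
 * filesystem's lookup path hands a fiocompressed file off to dcfs.  The
 * predicate below is hypothetical; ufs keeps the real compression flag
 * in its on-disk inode and tests it in ufs_lookup():
 *
 *	if (vnode_is_fiocompressed(vp)) {
 *		struct vnode *cvp = decompvp(vp, cr, ct);
 *
 *		if (cvp != NULL)
 *			vp = cvp;	(hand the dcfs shadow vnode to VFS)
 *	}
 */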

/*
 * Define data structures within this file.
 */
#define	DCSHFT		5
#define	DCTABLESIZE	16

#if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
#define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
#else
#define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) % DCTABLESIZE)
#endif

#define	DCLRUSIZE	16

#define	DCCACHESIZE	4

#define	rounddown(x, y)	((x) & ~((y) - 1))

struct dcnode	*dctable[DCTABLESIZE];

struct dcnode	*dclru;
static int	dclru_len;

kmutex_t	dctable_lock;

dev_t	dcdev;
struct vfs dc_vfs;

struct kmem_cache *dcnode_cache;
struct kmem_cache *dcbuf_cache[DCCACHESIZE];

kmutex_t dccache_lock;

static int dcinit(int, char *);

static struct dcnode	*dcnode_alloc(void);
static void		dcnode_free(struct dcnode *);
static void		dcnode_recycle(struct dcnode *);

static void dcinsert(struct dcnode *);
static void dcdelete(struct dcnode *);
static struct dcnode *dcfind(struct vnode *);
static void dclru_add(struct dcnode *);
static void dclru_sub(struct dcnode *);


/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>

struct vfsops *dc_vfsops;

static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"dcfs",
	dcinit,
	0,
	NULL
};

/*
 * Module linkage information for the kernel.
 */
extern struct mod_ops mod_fsops;

static struct modlfs modlfs = {
	&mod_fsops, "compressed filesystem", &vfw
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};

int
_init()
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
static int dc_close(struct vnode *, int, int, offset_t,
    struct cred *, caller_context_t *);
static int dc_read(struct vnode *, struct uio *, int, struct cred *,
    struct caller_context *);
static int dc_getattr(struct vnode *, struct vattr *, int,
    struct cred *, caller_context_t *);
static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
    struct caller_context *);
static int dc_access(struct vnode *, int, int,
    struct cred *, caller_context_t *);
static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    struct flk_callback *, struct cred *, caller_context_t *);
static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
    struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
    struct cred *, caller_context_t *);
static int dc_putpage(struct vnode *, offset_t, size_t, int,
    struct cred *, caller_context_t *);
static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uint_t, uint_t, uint_t, struct cred *, caller_context_t *);

struct vnodeops *dc_vnodeops;

const fs_operation_def_t dc_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = dc_open },
	VOPNAME_CLOSE,		{ .vop_close = dc_close },
	VOPNAME_READ,		{ .vop_read = dc_read },
	VOPNAME_GETATTR,	{ .vop_getattr = dc_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = dc_setattr },
	VOPNAME_ACCESS,		{ .vop_access = dc_access },
	VOPNAME_FSYNC,		{ .vop_fsync = dc_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = dc_inactive },
	VOPNAME_FID,		{ .vop_fid = dc_fid },
	VOPNAME_SEEK,		{ .vop_seek = dc_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = dc_frlock },
	VOPNAME_GETPAGE,	{ .vop_getpage = dc_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = dc_putpage },
	VOPNAME_MAP,		{ .vop_map = dc_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = dc_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = dc_delmap },
	NULL,			NULL
};

/*ARGSUSED*/
static int
dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static int
dc_close(struct vnode *vp, int flag, int count, offset_t off,
    struct cred *cr, caller_context_t *ctp)
{
	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
	return (0);
}

/*ARGSUSED*/
static int
dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
    struct caller_context *ct)
{
	struct dcnode *dp = VTODC(vp);
	size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
	size_t fsize = dp->dc_hdr->ch_fsize;
	int error;

	/*
	 * Loop through file with segmap, decompression will occur
	 * in dc_getpage()
	 */
	do {
		caddr_t base;
		size_t n;
		offset_t mapon;

		/*
		 * read to end of block or file
		 */
		mapon = uiop->uio_loffset & (rdsize - 1);
		n = MIN(rdsize - mapon, uiop->uio_resid);
		n = MIN(n, fsize - uiop->uio_loffset);
		if (n == 0)
			return (0);	/* at EOF */

		base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
		    S_READ);
		error = uiomove(base + mapon, n, UIO_READ, uiop);
		if (!error) {
			uint_t flags;

			if (n + mapon == rdsize || uiop->uio_loffset == fsize)
				flags = SM_DONTNEED;
			else
				flags = 0;
			error = segmap_release(segkmap, base, flags);
		} else
			(void) segmap_release(segkmap, base, 0);
	} while (!error && uiop->uio_resid);

	return (error);
}

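/*
 * Worked example for the windowing arithmetic above (illustrative, not
 * from the original source): with rdsize = 8192, uio_loffset = 0x2300,
 * uio_resid = 0x4000 and ch_fsize = 0x2800, mapon = 0x2300 & 0x1fff =
 * 0x300, n = MIN(8192 - 0x300, 0x4000) = 0x1d00, and then
 * n = MIN(0x1d00, 0x2800 - 0x2300) = 0x500, so this pass copies only the
 * 0x500 bytes that remain before EOF and the loop then terminates.
 */
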
static int
dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;
	int error;

	error = VOP_GETATTR(subvp, vap, flags, cred, ctp);

	/* substitute uncompressed size */
	vap->va_size = dp->dc_hdr->ch_fsize;
	return (error);
}

static int
dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
    caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
}

static int
dc_access(struct vnode *vp, int mode, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
}

/*ARGSUSED*/
static int
dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static void
dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);

	mutex_enter(&dctable_lock);
	mutex_enter(&vp->v_lock);
	ASSERT(vp->v_count >= 1);
	if (--vp->v_count != 0) {
		/*
		 * Somebody accessed the dcnode before we got a chance to
		 * remove it.  They will remove it when they do a vn_rele.
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&dctable_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	dcnode_free(dp);

	mutex_exit(&dctable_lock);
}

static int
dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_FID(subvp, fidp, ctp));
}

static int
dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SEEK(subvp, oof, noffp, ctp));
}

static int
dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp,
    cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);

	/*
	 * If file is being mapped, disallow frlock.
	 */
	if (dp->dc_mapcnt > 0)
		return (EAGAIN);

	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
}

/*ARGSUSED*/
static int
dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp;
	struct buf *bp;
	caddr_t saddr;
	off_t cblkno;
	size_t rdoff, rdsize, dsize;
	long xlen;
	int error, zerr;

	ASSERT(len == hdr->ch_blksize);
	/*
	 * Get destination pages and make them addressable
	 */
	pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
	bp = pageio_setup(pp, len, vp, B_READ);
	bp_mapin(bp);

	/*
	 * read compressed data from subordinate vnode
	 */
	saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
	cblkno = off / len;
	rdoff = hdr->ch_blkmap[cblkno];
	rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
	error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
	    UIO_SYSSPACE, 0, 0, cr, NULL);
	if (error)
		goto cleanup;

	/*
	 * Uncompress
	 */
	dsize = len;
	zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
	if (zerr != Z_OK) {
		error = EIO;
		goto cleanup;
	}

	/*
	 * Handle EOF
	 */
	xlen = hdr->ch_fsize - off;
	if (xlen < len) {
		bzero(bp->b_un.b_addr + xlen, len - xlen);
		if (dsize != xlen)
			error = EIO;
	} else if (dsize != len)
		error = EIO;

	/*
	 * Clean up
	 */
cleanup:
	kmem_cache_free(dp->dc_bufcache, saddr);
	pageio_done(bp);
	*ppp = pp;
	return (error);
}
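
/*
 * Illustrative example (not from the original source): ch_blkmap[] holds,
 * for each ch_blksize-sized block of the uncompressed file, the offset of
 * its compressed image in the subordinate file.  With ch_blksize = 8192
 * and ch_blkmap[] = { 4096, 6200, 7100, ... }, block 1 is inflated from
 * bytes [6200, 7100) of the subordinate vnode, i.e. rdoff = 6200 and
 * rdsize = 900 in dc_getblock_miss() above.  The extra map entry appended
 * by decompvp() (set to the compressed file's size) keeps the rdsize
 * computation valid for the final block.
 */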

static int
dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct page *pp, *plist = NULL;
	offset_t pgoff;
	int rdblk;

	/*
	 * pvn_read_kluster() doesn't quite do what we want, since it
	 * thinks sub block reads are ok.  Here we always decompress
	 * a full block.
	 */

	/*
	 * Check page cache
	 */
	rdblk = 0;
	for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
		pp = page_lookup(vp, pgoff, SE_EXCL);
		if (pp == NULL) {
			rdblk = 1;
			break;
		}
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
	}
	if (!rdblk) {
		*ppp = plist;
		return (0);	/* all pages in cache */
	}

	/*
	 * Undo any locks so getblock_miss has an open field
	 */
	if (plist != NULL)
		pvn_io_done(plist);

	return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
}

/*ARGSUSED10*/
static int
dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp, *plist = NULL;
	caddr_t vp_baddr;
	offset_t vp_boff, vp_bend;
	size_t bsize = hdr->ch_blksize;
	int nblks, error;

	/* does not support write */
	if (rw == S_WRITE) {
		panic("write attempt on compressed file");
		/*NOTREACHED*/
	}

	if (protp)
		*protp = PROT_ALL;
	/*
	 * We don't support asynchronous operation at the moment, so
	 * just pretend we did it.  If the pages are ever actually
	 * needed, they'll get brought in then.
	 */
	if (pl == NULL)
		return (0);

	/*
	 * Calc block start and end offsets
	 */
	vp_boff = rounddown(off, bsize);
	vp_bend = roundup(off + len, bsize);
	vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);

	nblks = (vp_bend - vp_boff) / bsize;
	while (nblks--) {
		error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
		    rw, cr);
		page_list_concat(&plist, &pp);
		vp_boff += bsize;
		vp_baddr += bsize;
	}
	if (!error)
		pvn_plist_init(plist, pl, plsz, off, len, rw);
	else
		pvn_read_done(plist, B_ERROR);
	return (error);
}

/*
 * This function should never be called.  We need to have it to pass
 * it as an argument to other functions.
 */
/*ARGSUSED*/
static int
dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
    int flags, struct cred *cr)
{
	/* should never happen */
	cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
	/*NOTREACHED*/
	return (0);
}


/*
 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
 * B_INVAL is set by:
 *
 * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
 * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
 *    which translates to an MC_SYNC with the MS_INVALIDATE flag.
 *
 * The B_FREE (as well as the B_DONTNEED) flag is set when the
 * MADV_SEQUENTIAL advice has been used.  VOP_PUTPAGE is invoked
 * from SEGVN to release pages behind a pagefault.
 */
/*ARGSUSED5*/
static int
dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
    struct cred *cr, caller_context_t *ctp)
{
	int error = 0;

	if (vp->v_count == 0) {
		panic("dcfs_putpage: bad v_count");
		/*NOTREACHED*/
	}

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (!vn_has_cached_data(vp))	/* no pages mapped */
		return (0);

	if (len == 0)		/* from 'off' to EOF */
		error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
	else {
		offset_t io_off;
		se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
			page_t *pp;

			/*
			 * We insist on getting the page only if we are
			 * about to invalidate, free or write it and
			 * the B_ASYNC flag is not set.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
				pp = page_lookup(vp, io_off, se);
			else
				pp = page_lookup_nowait(vp, io_off, se);

			if (pp == NULL)
				continue;
			/*
			 * Normally pvn_getdirty() should return 0, which
			 * implies that it has done the job for us.
			 * The shouldn't-happen scenario is when it returns 1.
			 * This means that the page has been modified and
			 * needs to be put back.
			 * Since we can't write to a dcfs compressed file,
			 * we fake a failed I/O and force pvn_write_done()
			 * to destroy the page.
			 */
			if (pvn_getdirty(pp, flags) == 1) {
				cmn_err(CE_NOTE, "dc_putpage: dirty page");
				pvn_write_done(pp, flags |
				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
			}
		}
	}
	return (error);
}

static int
dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ctp)
{
	struct vattr vattr;
	struct segvn_crargs vn_a;
	int error;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
		return (ENXIO);

	/*
	 * If file is being locked, disallow mapping.
	 */
	if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
		return (error);
	if (vn_has_mandatory_locks(vp, vattr.va_mode))
		return (EAGAIN);

	as_rangelock(as);

	if ((flags & MAP_FIXED) == 0) {
		map_addr(addrp, len, off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
	} else {
		/*
		 * User specified address - blow away any previous mappings
		 */
		(void) as_unmap(as, *addrp, len);
	}

	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.cred = cred;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);
	return (error);
}

/*ARGSUSED*/
static int
dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt += btopr(len);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*ARGSUSED*/
static int
dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt -= btopr(len);
	ASSERT(dp->dc_mapcnt >= 0);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*
 * Constructor/destructor routines for dcnodes
 */
/*ARGSUSED1*/
static int
dcnode_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct dcnode *dp = buf;
	struct vnode *vp;

	ASSERT(!(kmflags & KM_NOSLEEP));

	vp = vn_alloc(KM_SLEEP);
	vp->v_data = (caddr_t)dp;
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
	vn_setops(vp, dc_vnodeops);
	vn_exists(vp);

	dp->dc_vp = vp;
	mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
	dp->dc_mapcnt = 0;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	return (0);
}

/*ARGSUSED*/
static void
dcnode_destructor(void *buf, void *cdrarg)
{
	struct dcnode *dp = buf;
	struct vnode *vp = DCTOV(dp);

	mutex_destroy(&dp->dc_lock);

	VERIFY(dp->dc_hdr == NULL);
	VERIFY(dp->dc_subvp == NULL);
	vn_invalid(vp);
	vn_free(vp);
}

static struct dcnode *
dcnode_alloc(void)
{
	struct dcnode *dp;

	/*
	 * If the free list is above DCLRUSIZE
	 * re-use one from it
	 */
	mutex_enter(&dctable_lock);
	if (dclru_len < DCLRUSIZE) {
		mutex_exit(&dctable_lock);
		dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
	} else {
		ASSERT(dclru != NULL);
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		mutex_exit(&dctable_lock);
		dcnode_recycle(dp);
	}
	return (dp);
}

static void
dcnode_free(struct dcnode *dp)
{
	struct vnode *vp = DCTOV(dp);

	ASSERT(MUTEX_HELD(&dctable_lock));

	/*
	 * If no cached pages, no need to put it on lru
	 */
	if (!vn_has_cached_data(vp)) {
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
		return;
	}

	/*
	 * Add to lru, if it's over the limit, free from head
	 */
	dclru_add(dp);
	if (dclru_len > DCLRUSIZE) {
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
	}
}

static void
dcnode_recycle(struct dcnode *dp)
{
	struct vnode *vp;

	vp = DCTOV(dp);

	VN_RELE(dp->dc_subvp);
	dp->dc_subvp = NULL;
	(void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
	kmem_free(dp->dc_hdr, dp->dc_hdrsize);
	dp->dc_hdr = NULL;
	dp->dc_hdrsize = dp->dc_zmax = 0;
	dp->dc_bufcache = NULL;
	dp->dc_mapcnt = 0;
	vn_reinit(vp);
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
}

static int
dcinit(int fstype, char *name)
{
	static const fs_operation_def_t dc_vfsops_template[] = {
		NULL, NULL
	};
	int error;
	major_t dev;

	error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
	if (error) {
		cmn_err(CE_WARN, "dcinit: bad vfs ops template");
		return (error);
	}
	VFS_INIT(&dc_vfs, dc_vfsops, NULL);
	dc_vfs.vfs_flag = VFS_RDONLY;
	dc_vfs.vfs_fstype = fstype;
	if ((dev = getudev()) == (major_t)-1)
		dev = 0;
	dcdev = makedevice(dev, 0);
	dc_vfs.vfs_dev = dcdev;

	error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
	if (error != 0) {
		(void) vfs_freevfsops_by_type(fstype);
		cmn_err(CE_WARN, "dcinit: bad vnode ops template");
		return (error);
	}

	mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
	dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
	    0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);

	return (0);
}

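/*
 * For reference, a sketch of the compressed-file header that decompvp()
 * parses below.  The authoritative definition lives in sys/fs/decomp.h;
 * the field widths and ordering shown here are illustrative assumptions,
 * only the names and the checks are taken from the code in this file:
 *
 *	struct comphdr {
 *		uint64_t	ch_magic;	must equal CH_MAGIC
 *		uint64_t	ch_version;	must equal CH_VERSION
 *		uint64_t	ch_algorithm;	only CH_ALG_ZLIB is accepted
 *		uint64_t	ch_fsize;	uncompressed file size
 *		uint64_t	ch_blksize;	power of two, >= PAGESIZE,
 *						<= ptob(DCCACHESIZE)
 *		uint64_t	ch_blkmap[];	per-block compressed offsets
 *	};
 *
 * decompvp() rejects any file that fails these checks and sizes the full
 * header read from ch_blkmap[0].
 */
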
/*
 * Return shadow vnode with the given vp as its subordinate
 */
struct vnode *
decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp, *ndp;
	struct comphdr thdr, *hdr;
	struct kmem_cache **cpp;
	struct vattr vattr;
	size_t hdrsize, bsize;
	int error;

	/*
	 * See if we have an existing shadow
	 * If none, we have to manufacture one
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	mutex_exit(&dctable_lock);
	if (dp != NULL)
		return (DCTOV(dp));

	/*
	 * Make sure it's a valid compressed file
	 */
	hdr = &thdr;
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
	    UIO_SYSSPACE, 0, 0, cred, NULL);
	if (error || hdr->ch_magic != CH_MAGIC ||
	    hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
	    hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
	    hdr->ch_blksize > ptob(DCCACHESIZE) ||
	    (hdr->ch_blksize & (hdr->ch_blksize - 1)) != 0)
		return (NULL);

	/* get underlying file size */
	if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
		return (NULL);

	/*
	 * Re-read entire header
	 */
	hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
	hdr = kmem_alloc(hdrsize, KM_SLEEP);
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
	    0, 0, cred, NULL);
	if (error) {
		kmem_free(hdr, hdrsize);
		return (NULL);
	}

	/*
	 * add extra blkmap entry to make dc_getblock()'s
	 * life easier
	 */
	bsize = hdr->ch_blksize;
	hdr->ch_blkmap[((hdr->ch_fsize - 1) / bsize) + 1] = vattr.va_size;

	ndp = dcnode_alloc();
	ndp->dc_subvp = vp;
	VN_HOLD(vp);
	ndp->dc_hdr = hdr;
	ndp->dc_hdrsize = hdrsize;

	/*
	 * Allocate kmem cache if none there already
	 */
	ndp->dc_zmax = ZMAXBUF(bsize);
	cpp = &dcbuf_cache[btop(bsize)];
	mutex_enter(&dccache_lock);
	if (*cpp == NULL)
		*cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
		    NULL, NULL, NULL, NULL, 0);
	mutex_exit(&dccache_lock);
	ndp->dc_bufcache = *cpp;

	/*
	 * Recheck table in case someone else created shadow
	 * while we were blocked above.
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	if (dp != NULL) {
		mutex_exit(&dctable_lock);
		dcnode_recycle(ndp);
		kmem_cache_free(dcnode_cache, ndp);
		return (DCTOV(dp));
	}
	dcinsert(ndp);
	mutex_exit(&dctable_lock);

	return (DCTOV(ndp));
}


/*
 * dcnode lookup table
 * These routines maintain a table of dcnodes hashed by their
 * subordinate vnode so that they can be found if they already
 * exist in the vnode cache
 */

/*
 * Put a dcnode in the table.
 */
static void
dcinsert(struct dcnode *newdp)
{
	int idx = DCHASH(newdp->dc_subvp);

	ASSERT(MUTEX_HELD(&dctable_lock));
	newdp->dc_hash = dctable[idx];
	dctable[idx] = newdp;
}

/*
 * Remove a dcnode from the hash table.
 */
void
dcdelete(struct dcnode *deldp)
{
	int idx = DCHASH(deldp->dc_subvp);
	struct dcnode *dp, *prevdp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	dp = dctable[idx];
	if (dp == deldp)
		dctable[idx] = dp->dc_hash;
	else {
		for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
		    prevdp = dp, dp = dp->dc_hash) {
			if (dp == deldp) {
				prevdp->dc_hash = dp->dc_hash;
				break;
			}
		}
	}
	ASSERT(dp != NULL);
}

/*
 * Find a shadow vnode in the dctable hash list.
 */
static struct dcnode *
dcfind(struct vnode *vp)
{
	struct dcnode *dp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
		if (dp->dc_subvp == vp) {
			VN_HOLD(DCTOV(dp));
			if (dp->dc_lrunext)
				dclru_sub(dp);
			return (dp);
		}
	return (NULL);
}

#ifdef	DEBUG
static int
dclru_count(void)
{
	struct dcnode *dp;
	int i = 0;

	if (dclru == NULL)
		return (0);
	for (dp = dclru; dp->dc_lrunext != dclru; dp = dp->dc_lrunext)
		i++;
	return (i + 1);
}
#endif

static void
dclru_add(struct dcnode *dp)
{
	/*
	 * Add to dclru as double-link chain
	 */
	ASSERT(MUTEX_HELD(&dctable_lock));
	if (dclru == NULL) {
		dclru = dp;
		dp->dc_lruprev = dp->dc_lrunext = dp;
	} else {
		struct dcnode *last = dclru->dc_lruprev;

		dclru->dc_lruprev = dp;
		last->dc_lrunext = dp;
		dp->dc_lruprev = last;
		dp->dc_lrunext = dclru;
	}
	dclru_len++;
	ASSERT(dclru_len == dclru_count());
}

static void
dclru_sub(struct dcnode *dp)
{
	ASSERT(MUTEX_HELD(&dctable_lock));
	dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
	dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
	if (dp == dclru)
		dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dclru_len--;
	ASSERT(dclru_len == dclru_count());
}