/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/buf.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/vmsystm.h>
#include <sys/open.h>
#include <sys/swap.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/zmod.h>
#include <sys/fs/decomp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>

#include <fs/fs_subr.h>

/*
 * dcfs - A filesystem for automatic decompression of fiocompressed files
 *
 * This is a layered filesystem that sits on top of a normal persistent
 * filesystem and provides automatic decompression of files that have
 * previously been compressed and stored on the host filesystem.
 * It is a pseudo filesystem in that it does not persist data; rather, it
 * intercepts file lookup requests on the host filesystem and provides
 * transparent decompression of those files.  Currently the only supported
 * host filesystem is ufs.
 *
 * A file is compressed by a userland utility (currently
 * cmd/boot/fiocompress) and marked by fiocompress as a compressed file
 * via a flag in the on-disk inode, set through a ufs ioctl()
 * (_FIO_COMPRESSED; see ufs_ioctl() in ufs_vnops.c).
 * ufs_lookup() checks for this flag and, if it is set, passes control to
 * decompvp(), a function defined in this (dcfs) filesystem.  decompvp()
 * uncompresses the file and returns a dcfs vnode to the VFS layer.
 *
 * dcfs is layered on top of ufs and passes requests involving persistence
 * to the underlying ufs filesystem.  Compressed files cannot currently
 * be written to.
 */
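
/*
 * Illustrative sketch (not part of the original source): the on-disk
 * layout of a fiocompressed file, as implied by decompvp() and
 * dc_getblock_miss() below (struct comphdr comes from <sys/fs/decomp.h>):
 *
 *	+--------------------+  offset 0
 *	| struct comphdr     |  ch_magic, ch_version, ch_algorithm,
 *	|                    |  ch_fsize (uncompressed size),
 *	|                    |  ch_blksize (power of two),
 *	|                    |  ch_blkmap[] of uint64_t file offsets
 *	+--------------------+  ch_blkmap[0]
 *	| compressed block 0 |  inflates to ch_blksize bytes
 *	+--------------------+  ch_blkmap[1]
 *	| compressed block 1 |  ...
 *	+--------------------+
 *
 * Compressed block i occupies [ch_blkmap[i], ch_blkmap[i + 1]) in the
 * underlying file; decompvp() appends one sentinel entry (the size of
 * the underlying file) so that this also holds for the last block.
 */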

/*
 * Define data structures within this file.
 */
#define	DCSHFT		5
#define	DCTABLESIZE	16

#if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
#define	DCHASH(vp)	(((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
#else
#define	DCHASH(vp)	(((uintptr_t)(vp) >> DCSHFT) % DCTABLESIZE)
#endif

#define	DCLRUSIZE	16

#define	DCCACHESIZE	4

#define	rounddown(x, y)	((x) & ~((y) - 1))
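
/*
 * Worked example (illustrative only): DCTABLESIZE is 16, a power of
 * two, so the first DCHASH() definition above is the one compiled in,
 * and it reduces to
 *
 *	(((uintptr_t)(vp) >> 5) & 0xf)
 *
 * i.e., drop the low DCSHFT bits of the vnode address (which carry
 * little entropy for heap-allocated vnodes) and use the next four bits
 * as the table index.  rounddown() likewise assumes a power-of-two
 * second argument, e.g. rounddown(0x1234, 0x1000) == 0x1000.
 */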

struct dcnode	*dctable[DCTABLESIZE];

struct dcnode	*dclru;
static int	dclru_len;

kmutex_t	dctable_lock;

dev_t		dcdev;
struct vfs	dc_vfs;

struct kmem_cache *dcnode_cache;
struct kmem_cache *dcbuf_cache[DCCACHESIZE];

kmutex_t	dccache_lock;

static int dcinit(int, char *);

static struct dcnode	*dcnode_alloc(void);
static void		dcnode_free(struct dcnode *);
static void		dcnode_recycle(struct dcnode *);

static void		dcinsert(struct dcnode *);
static void		dcdelete(struct dcnode *);
static struct dcnode	*dcfind(struct vnode *);
static void		dclru_add(struct dcnode *);
static void		dclru_sub(struct dcnode *);


/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>

struct vfsops *dc_vfsops;

static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"dcfs",
	dcinit,
	VSW_ZMOUNT,
	NULL
};

/*
 * Module linkage information for the kernel.
 */
extern struct mod_ops mod_fsops;

static struct modlfs modlfs = {
	&mod_fsops, "compressed filesystem", &vfw
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};

int
_init()
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
static int dc_close(struct vnode *, int, int, offset_t,
    struct cred *, caller_context_t *);
static int dc_read(struct vnode *, struct uio *, int, struct cred *,
    struct caller_context *);
static int dc_getattr(struct vnode *, struct vattr *, int,
    struct cred *, caller_context_t *);
static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
    struct caller_context *);
static int dc_access(struct vnode *, int, int,
    struct cred *, caller_context_t *);
static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    struct flk_callback *, struct cred *, caller_context_t *);
static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
    struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
    struct cred *, caller_context_t *);
static int dc_putpage(struct vnode *, offset_t, size_t, int,
    struct cred *, caller_context_t *);
static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uint_t, uint_t, uint_t, struct cred *, caller_context_t *);

struct vnodeops *dc_vnodeops;

const fs_operation_def_t dc_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = dc_open },
	VOPNAME_CLOSE,		{ .vop_close = dc_close },
	VOPNAME_READ,		{ .vop_read = dc_read },
	VOPNAME_GETATTR,	{ .vop_getattr = dc_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = dc_setattr },
	VOPNAME_ACCESS,		{ .vop_access = dc_access },
	VOPNAME_FSYNC,		{ .vop_fsync = dc_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = dc_inactive },
	VOPNAME_FID,		{ .vop_fid = dc_fid },
	VOPNAME_SEEK,		{ .vop_seek = dc_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = dc_frlock },
	VOPNAME_GETPAGE,	{ .vop_getpage = dc_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = dc_putpage },
	VOPNAME_MAP,		{ .vop_map = dc_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = dc_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = dc_delmap },
	NULL,			NULL
};

/*ARGSUSED*/
static int
dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static int
dc_close(struct vnode *vp, int flag, int count, offset_t off,
    struct cred *cr, caller_context_t *ctp)
{
	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
	return (0);
}

/*ARGSUSED*/
static int
dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
    struct caller_context *ct)
{
	struct dcnode *dp = VTODC(vp);
	size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
	size_t fsize = dp->dc_hdr->ch_fsize;
	int error;

	/*
	 * Loop through the file with segmap; decompression happens
	 * in the getpage path (dc_getpage() -> dc_getblock_miss())
	 */
	do {
		caddr_t base;
		size_t n;
		offset_t mapon;

		/*
		 * read to end of block or file
		 */
		mapon = uiop->uio_loffset & (rdsize - 1);
		n = MIN(rdsize - mapon, uiop->uio_resid);
		n = MIN(n, fsize - uiop->uio_loffset);
		if (n == 0)
			return (0);	/* at EOF */

		base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
		    S_READ);
		error = uiomove(base + mapon, n, UIO_READ, uiop);
		if (!error) {
			uint_t flags;

			if (n + mapon == rdsize || uiop->uio_loffset == fsize)
				flags = SM_DONTNEED;
			else
				flags = 0;
			error = segmap_release(segkmap, base, flags);
		} else
			(void) segmap_release(segkmap, base, 0);
	} while (!error && uiop->uio_resid);

	return (error);
}
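
/*
 * Worked example of the windowing arithmetic in dc_read() above
 * (illustrative; assumes rdsize == 8192 for concreteness):
 *
 *	uio_loffset = 6000, uio_resid = 10000, fsize = 20000
 *	mapon = 6000 & 8191 = 6000		(offset within window)
 *	n = MIN(8192 - 6000, 10000) = 2192	(stop at window edge)
 *	n = MIN(2192, 20000 - 6000) = 2192	(don't run past EOF)
 *
 * The first pass copies 2192 bytes up to the window boundary; later
 * passes start aligned (mapon == 0) and take a full window, or
 * whatever is left of the request or the file, whichever is smaller.
 */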

static int
dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;
	int error;

	error = VOP_GETATTR(subvp, vap, flags, cred, ctp);

	/* substitute uncompressed size */
	vap->va_size = dp->dc_hdr->ch_fsize;
	return (error);
}

static int
dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
    caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
}

static int
dc_access(struct vnode *vp, int mode, int flags,
    cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
}

/*ARGSUSED*/
static int
dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
{
	return (0);
}

/*ARGSUSED*/
static void
dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);

	mutex_enter(&dctable_lock);
	mutex_enter(&vp->v_lock);
	ASSERT(vp->v_count >= 1);
	if (--vp->v_count != 0) {
		/*
		 * Somebody accessed the dcnode before we got a chance to
		 * remove it.  They will remove it when they do a vn_rele.
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&dctable_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	dcnode_free(dp);

	mutex_exit(&dctable_lock);
}

static int
dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_FID(subvp, fidp, ctp));
}

static int
dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct vnode *subvp = dp->dc_subvp;

	return (VOP_SEEK(subvp, oof, noffp, ctp));
}

static int
dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp,
    cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);

	/*
	 * If file is being mapped, disallow frlock.
	 */
	if (dp->dc_mapcnt > 0)
		return (EAGAIN);

	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
}

/*ARGSUSED*/
static int
dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp;
	struct buf *bp;
	caddr_t saddr;
	off_t cblkno;
	size_t rdoff, rdsize, dsize;
	long xlen;
	int error, zerr;

	ASSERT(len == hdr->ch_blksize);
	/*
	 * Get destination pages and make them addressable
	 */
	pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
	bp = pageio_setup(pp, len, vp, B_READ);
	bp_mapin(bp);

	/*
	 * read compressed data from subordinate vnode
	 */
	saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
	cblkno = off / len;
	rdoff = hdr->ch_blkmap[cblkno];
	rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
	error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
	    UIO_SYSSPACE, 0, 0, cr, NULL);
	if (error)
		goto cleanup;

	/*
	 * Uncompress
	 */
	dsize = len;
	zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
	if (zerr != Z_OK) {
		error = EIO;
		goto cleanup;
	}

	/*
	 * Handle EOF
	 */
	xlen = hdr->ch_fsize - off;
	if (xlen < len) {
		bzero(bp->b_un.b_addr + xlen, len - xlen);
		if (dsize != xlen)
			error = EIO;
	} else if (dsize != len)
		error = EIO;

	/*
	 * Clean up
	 */
cleanup:
	kmem_cache_free(dp->dc_bufcache, saddr);
	pageio_done(bp);
	*ppp = pp;
	return (error);
}
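
/*
 * Illustrative numbers for dc_getblock_miss() above (not from the
 * original source): with ch_blksize == 8192, a miss on uncompressed
 * offset off == 3 * 8192 computes
 *
 *	cblkno = off / len = 3
 *	rdoff  = ch_blkmap[3]
 *	rdsize = ch_blkmap[4] - ch_blkmap[3]
 *
 * so exactly one compressed block is read and inflated into the 8192
 * bytes of destination pages.  For the final block of the file,
 * ch_fsize - off can fall short of a full block; the tail is zeroed
 * and dsize is checked against that short length instead.
 */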

static int
dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct page *pp, *plist = NULL;
	offset_t pgoff;
	int rdblk;

	/*
	 * pvn_read_kluster() doesn't quite do what we want, since it
	 * thinks sub block reads are ok.  Here we always decompress
	 * a full block.
	 */

	/*
	 * Check page cache
	 */
	rdblk = 0;
	for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
		pp = page_lookup(vp, pgoff, SE_EXCL);
		if (pp == NULL) {
			rdblk = 1;
			break;
		}
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
	}
	if (!rdblk) {
		*ppp = plist;
		return (0);	/* all pages in cache */
	}

	/*
	 * Undo any locks so getblock_miss has an open field
	 */
	if (plist != NULL)
		pvn_io_done(plist);

	return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
}

/*ARGSUSED10*/
static int
dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp, *plist = NULL;
	caddr_t vp_baddr;
	offset_t vp_boff, vp_bend;
	size_t bsize = hdr->ch_blksize;
	int nblks, error;

	/* does not support write */
	if (rw == S_WRITE) {
		panic("write attempt on compressed file");
		/*NOTREACHED*/
	}

	if (protp)
		*protp = PROT_ALL;
	/*
	 * We don't support asynchronous operation at the moment, so
	 * just pretend we did it.  If the pages are ever actually
	 * needed, they'll get brought in then.
	 */
	if (pl == NULL)
		return (0);

	/*
	 * Calc block start and end offsets
	 */
	vp_boff = rounddown(off, bsize);
	vp_bend = roundup(off + len, bsize);
	vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);

	nblks = (vp_bend - vp_boff) / bsize;
	while (nblks--) {
		error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
		    rw, cr);
		page_list_concat(&plist, &pp);
		vp_boff += bsize;
		vp_baddr += bsize;
	}
	if (!error)
		pvn_plist_init(plist, pl, plsz, off, len, rw);
	else
		pvn_read_done(plist, B_ERROR);
	return (error);
}
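
/*
 * Example of the block-alignment math in dc_getpage() above
 * (illustrative): with bsize == 8192, a fault for off == 5000,
 * len == 100 expands to
 *
 *	vp_boff = rounddown(5000, 8192) = 0
 *	vp_bend = roundup(5100, 8192) = 8192
 *	nblks = (8192 - 0) / 8192 = 1
 *
 * so the whole enclosing compressed block is brought in even though
 * only a few bytes were asked for; as noted in dc_getblock(),
 * sub-block reads are never done.
 */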

/*
 * This function should never be called.  We need to have it to pass
 * it as an argument to other functions.
 */
/*ARGSUSED*/
static int
dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
    int flags, struct cred *cr)
{
	/* should never happen */
	cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
	/*NOTREACHED*/
	return (0);
}


/*
 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
 * B_INVAL is set by:
 *
 * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
 * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
 *    which translates to an MC_SYNC with the MS_INVALIDATE flag.
 *
 * The B_FREE (as well as the B_DONTNEED) flag is set when the
 * MADV_SEQUENTIAL advice has been used.  VOP_PUTPAGE is invoked
 * from SEGVN to release pages behind a pagefault.
 */
/*ARGSUSED5*/
static int
dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
    struct cred *cr, caller_context_t *ctp)
{
	int error = 0;

	if (vp->v_count == 0) {
		panic("dcfs_putpage: bad v_count");
		/*NOTREACHED*/
	}

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (!vn_has_cached_data(vp))	/* no pages mapped */
		return (0);

	if (len == 0)		/* from 'off' to EOF */
		error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
	else {
		offset_t io_off;
		se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
			page_t *pp;

			/*
			 * We insist on getting the page only if we are
			 * about to invalidate, free or write it and
			 * the B_ASYNC flag is not set.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
				pp = page_lookup(vp, io_off, se);
			else
				pp = page_lookup_nowait(vp, io_off, se);

			if (pp == NULL)
				continue;
			/*
			 * Normally pvn_getdirty() should return 0, which
			 * implies that it has done the job for us.
			 * The shouldn't-happen scenario is when it returns 1.
			 * This means that the page has been modified and
			 * needs to be put back.
			 * Since we can't write to a dcfs compressed file,
			 * we fake a failed I/O and force pvn_write_done()
			 * to destroy the page.
			 */
			if (pvn_getdirty(pp, flags) == 1) {
				cmn_err(CE_NOTE, "dc_putpage: dirty page");
				pvn_write_done(pp, flags |
				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
			}
		}
	}
	return (error);
}

static int
dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ctp)
{
	struct vattr vattr;
	struct segvn_crargs vn_a;
	int error;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
		return (ENXIO);

	/*
	 * If file is being locked, disallow mapping.
	 */
	if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
		return (error);
	if (vn_has_mandatory_locks(vp, vattr.va_mode))
		return (EAGAIN);

	as_rangelock(as);

	if ((flags & MAP_FIXED) == 0) {
		map_addr(addrp, len, off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
	} else {
		/*
		 * User specified address - blow away any previous mappings
		 */
		(void) as_unmap(as, *addrp, len);
	}

	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.cred = cred;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);
	return (error);
}

/*ARGSUSED*/
static int
dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt += btopr(len);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*ARGSUSED*/
static int
dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cr, caller_context_t *ctp)
{
	struct dcnode *dp;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	dp = VTODC(vp);
	mutex_enter(&dp->dc_lock);
	dp->dc_mapcnt -= btopr(len);
	ASSERT(dp->dc_mapcnt >= 0);
	mutex_exit(&dp->dc_lock);
	return (0);
}

/*
 * Constructor/destructor routines for dcnodes
 */
/*ARGSUSED1*/
static int
dcnode_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct dcnode *dp = buf;
	struct vnode *vp;

	vp = dp->dc_vp = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vp->v_data = dp;
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
	vn_setops(vp, dc_vnodeops);
	vn_exists(vp);

	mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
	dp->dc_mapcnt = 0;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dp->dc_hdr = NULL;
	dp->dc_subvp = NULL;
	return (0);
}

/*ARGSUSED*/
static void
dcnode_destructor(void *buf, void *cdrarg)
{
	struct dcnode *dp = buf;
	struct vnode *vp = DCTOV(dp);

	mutex_destroy(&dp->dc_lock);

	VERIFY(dp->dc_hdr == NULL);
	VERIFY(dp->dc_subvp == NULL);
	vn_invalid(vp);
	vn_free(vp);
}

static struct dcnode *
dcnode_alloc(void)
{
	struct dcnode *dp;

	/*
	 * If the lru list has reached the DCLRUSIZE limit,
	 * recycle a dcnode from it rather than allocating a new one
	 */
	mutex_enter(&dctable_lock);
	if (dclru_len < DCLRUSIZE) {
		mutex_exit(&dctable_lock);
		dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
	} else {
		ASSERT(dclru != NULL);
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		mutex_exit(&dctable_lock);
		dcnode_recycle(dp);
	}
	return (dp);
}

static void
dcnode_free(struct dcnode *dp)
{
	struct vnode *vp = DCTOV(dp);

	ASSERT(MUTEX_HELD(&dctable_lock));

	/*
	 * If no cached pages, no need to put it on lru
	 */
	if (!vn_has_cached_data(vp)) {
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
		return;
	}

	/*
	 * Add to lru; if it's over the limit, free from the head
	 */
	dclru_add(dp);
	if (dclru_len > DCLRUSIZE) {
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
	}
}

static void
dcnode_recycle(struct dcnode *dp)
{
	struct vnode *vp;

	vp = DCTOV(dp);

	VN_RELE(dp->dc_subvp);
	dp->dc_subvp = NULL;
	(void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
	kmem_free(dp->dc_hdr, dp->dc_hdrsize);
	dp->dc_hdr = NULL;
	dp->dc_hdrsize = dp->dc_zmax = 0;
	dp->dc_bufcache = NULL;
	dp->dc_mapcnt = 0;
	vn_reinit(vp);
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
}
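
/*
 * dcnode lifecycle sketch (illustrative summary; see also the dctable
 * and dclru routines below): a dcnode whose vnode count drops to zero
 * but which still has cached pages is parked on dclru rather than
 * destroyed, so its already-decompressed pages stay in the page cache:
 *
 *	decompvp()	-> dcnode_alloc() + dcinsert()
 *	last VN_RELE()	-> dc_inactive() -> dcnode_free() -> dclru_add()
 *	re-lookup	-> dcfind() -> dclru_sub()	(revived from lru)
 *	lru overflow	-> dclru_sub() + dcdelete() + kmem_cache_free()
 */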

static int
dcinit(int fstype, char *name)
{
	static const fs_operation_def_t dc_vfsops_template[] = {
		NULL, NULL
	};
	int error;
	major_t dev;

	error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
	if (error) {
		cmn_err(CE_WARN, "dcinit: bad vfs ops template");
		return (error);
	}
	VFS_INIT(&dc_vfs, dc_vfsops, NULL);
	dc_vfs.vfs_flag = VFS_RDONLY;
	dc_vfs.vfs_fstype = fstype;
	if ((dev = getudev()) == (major_t)-1)
		dev = 0;
	dcdev = makedevice(dev, 0);
	dc_vfs.vfs_dev = dcdev;

	error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
	if (error != 0) {
		(void) vfs_freevfsops_by_type(fstype);
		cmn_err(CE_WARN, "dcinit: bad vnode ops template");
		return (error);
	}

	mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
	dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
	    0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);

	return (0);
}

/*
 * Return shadow vnode with the given vp as its subordinate
 */
struct vnode *
decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp, *ndp;
	struct comphdr thdr, *hdr;
	struct kmem_cache **cpp;
	struct vattr vattr;
	size_t hdrsize, bsize;
	int error;

	/*
	 * See if we have an existing shadow.
	 * If none, we have to manufacture one.
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	mutex_exit(&dctable_lock);
	if (dp != NULL)
		return (DCTOV(dp));

	/*
	 * Make sure it's a valid compressed file
	 */
	hdr = &thdr;
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
	    UIO_SYSSPACE, 0, 0, cred, NULL);
	if (error || hdr->ch_magic != CH_MAGIC_ZLIB ||
	    hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
	    hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
	    hdr->ch_blksize > ptob(DCCACHESIZE) ||
	    (hdr->ch_blksize & (hdr->ch_blksize - 1)) != 0)
		return (NULL);

	/* get underlying file size */
	if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
		return (NULL);

	/*
	 * Re-read entire header
	 */
	hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
	hdr = kmem_alloc(hdrsize, KM_SLEEP);
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
	    0, 0, cred, NULL);
	if (error) {
		kmem_free(hdr, hdrsize);
		return (NULL);
	}

	/*
	 * add extra blkmap entry to make dc_getblock()'s
	 * life easier
	 */
	bsize = hdr->ch_blksize;
	hdr->ch_blkmap[((hdr->ch_fsize - 1) / bsize) + 1] = vattr.va_size;

	ndp = dcnode_alloc();
	ndp->dc_subvp = vp;
	VN_HOLD(vp);
	ndp->dc_hdr = hdr;
	ndp->dc_hdrsize = hdrsize;

	/*
	 * Allocate kmem cache if none there already
	 */
	ndp->dc_zmax = ZMAXBUF(bsize);
	cpp = &dcbuf_cache[btop(bsize)];
	mutex_enter(&dccache_lock);
	if (*cpp == NULL)
		*cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
		    NULL, NULL, NULL, NULL, 0);
	mutex_exit(&dccache_lock);
	ndp->dc_bufcache = *cpp;

	/*
	 * Recheck table in case someone else created shadow
	 * while we were blocked above.
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	if (dp != NULL) {
		mutex_exit(&dctable_lock);
		dcnode_recycle(ndp);
		kmem_cache_free(dcnode_cache, ndp);
		return (DCTOV(dp));
	}
	dcinsert(ndp);
	mutex_exit(&dctable_lock);

	return (DCTOV(ndp));
}
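
/*
 * Hypothetical caller sketch (for illustration only; the actual hook
 * lives in ufs_lookup(), not in this file).  When the host filesystem
 * sees the compressed-file flag on an inode, it swaps in the dcfs
 * shadow vnode:
 *
 *	struct vnode *cvp = decompvp(vp, cr, ct);
 *	if (cvp != NULL) {
 *		VN_RELE(vp);	(decompvp() took its own hold on vp)
 *		vp = cvp;	(hand the dcfs shadow to the VFS layer)
 *	}
 *
 * A NULL return means the file is not a valid compressed file, and
 * lookup proceeds with the ordinary vnode.
 */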


/*
 * dcnode lookup table
 * These routines maintain a table of dcnodes hashed by their
 * subordinate vnode so that they can be found if they already
 * exist in the vnode cache
 */

/*
 * Put a dcnode in the table.
 */
static void
dcinsert(struct dcnode *newdp)
{
	int idx = DCHASH(newdp->dc_subvp);

	ASSERT(MUTEX_HELD(&dctable_lock));
	newdp->dc_hash = dctable[idx];
	dctable[idx] = newdp;
}

/*
 * Remove a dcnode from the hash table.
 */
static void
dcdelete(struct dcnode *deldp)
{
	int idx = DCHASH(deldp->dc_subvp);
	struct dcnode *dp, *prevdp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	dp = dctable[idx];
	if (dp == deldp)
		dctable[idx] = dp->dc_hash;
	else {
		for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
		    prevdp = dp, dp = dp->dc_hash) {
			if (dp == deldp) {
				prevdp->dc_hash = dp->dc_hash;
				break;
			}
		}
	}
	ASSERT(dp != NULL);
}

/*
 * Find a shadow vnode in the dctable hash list.
 */
static struct dcnode *
dcfind(struct vnode *vp)
{
	struct dcnode *dp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
		if (dp->dc_subvp == vp) {
			VN_HOLD(DCTOV(dp));
			if (dp->dc_lrunext)
				dclru_sub(dp);
			return (dp);
		}
	return (NULL);
}

#ifdef DEBUG
static int
dclru_count(void)
{
	struct dcnode *dp;
	int i = 0;

	if (dclru == NULL)
		return (0);
	for (dp = dclru; dp->dc_lrunext != dclru; dp = dp->dc_lrunext)
		i++;
	return (i + 1);
}
#endif

static void
dclru_add(struct dcnode *dp)
{
	/*
	 * Add to dclru as double-link chain
	 */
	ASSERT(MUTEX_HELD(&dctable_lock));
	if (dclru == NULL) {
		dclru = dp;
		dp->dc_lruprev = dp->dc_lrunext = dp;
	} else {
		struct dcnode *last = dclru->dc_lruprev;

		dclru->dc_lruprev = dp;
		last->dc_lrunext = dp;
		dp->dc_lruprev = last;
		dp->dc_lrunext = dclru;
	}
	dclru_len++;
	ASSERT(dclru_len == dclru_count());
}

static void
dclru_sub(struct dcnode *dp)
{
	ASSERT(MUTEX_HELD(&dctable_lock));
	dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
	dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
	if (dp == dclru)
		dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dclru_len--;
	ASSERT(dclru_len == dclru_count());
}