1 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #include <sys/types.h> 40 #include <sys/thread.h> 41 #include <sys/t_lock.h> 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/bitmap.h> 45 #include <sys/buf.h> 46 #include <sys/cmn_err.h> 47 #include <sys/conf.h> 48 #include <sys/ddi.h> 49 #include <sys/debug.h> 50 #include <sys/errno.h> 51 #include <sys/time.h> 52 #include <sys/fcntl.h> 53 #include <sys/flock.h> 54 #include <sys/file.h> 55 #include <sys/kmem.h> 56 #include <sys/mman.h> 57 #include <sys/vmsystm.h> 58 #include <sys/open.h> 59 #include <sys/swap.h> 60 #include <sys/sysmacros.h> 61 #include <sys/uio.h> 62 #include <sys/vfs.h> 63 #include <sys/vfs_opreg.h> 64 #include <sys/vnode.h> 65 #include <sys/stat.h> 66 #include <sys/poll.h> 67 #include <sys/zmod.h> 68 #include <sys/fs/decomp.h> 69 70 #include <vm/hat.h> 71 #include <vm/as.h> 72 #include <vm/page.h> 73 #include <vm/pvn.h> 74 #include <vm/seg_vn.h> 75 #include <vm/seg_kmem.h> 76 #include <vm/seg_map.h> 77 78 #include <fs/fs_subr.h> 79 80 /* 81 * dcfs - A filesystem for automatic decompressing of fiocompressed files 82 * 83 * This filesystem is a layered filesystem that sits on top of a normal 84 * persistent filesystem and provides automatic decompression of files 85 * that have been previously compressed and stored on the host file system. 86 * This is a pseudo filesystem in that it does not persist data, rather it 87 * intercepts file lookup requests on the host filesystem and provides 88 * transparent decompression of those files. Currently the only supported 89 * host filesystem is ufs. 90 * 91 * A file is compressed via a userland utility (currently cmd/boot/fiocompress) 92 * and marked by fiocompress as a compressed file via a flag in the on-disk 93 * inode (set via a ufs ioctl() - see `ufs_vnops.c`ufs_ioctl()`_FIO_COMPRESSED 94 * ufs_lookup checks for this flag and if set, passes control to decompvp 95 * a function defined in this (dcfs) filesystem. decomvp uncompresses the file 96 * and returns a dcfs vnode to the VFS layer. 97 * 98 * dcfs is layered on top of ufs and passes requests involving persistence 99 * to the underlying ufs filesystem. The compressed files currently cannot be 100 * written to. 101 */ 102 103 104 /* 105 * Define data structures within this file. 106 */ 107 #define DCSHFT 5 108 #define DCTABLESIZE 16 109 110 #if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0) 111 #define DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1)) 112 #else 113 #define DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) % DTABLESIZEC) 114 #endif 115 116 #define DCLRUSIZE 16 117 118 #define DCCACHESIZE 4 119 120 #define rounddown(x, y) ((x) & ~((y) - 1)) 121 122 struct dcnode *dctable[DCTABLESIZE]; 123 124 struct dcnode *dclru; 125 static int dclru_len; 126 127 kmutex_t dctable_lock; 128 129 dev_t dcdev; 130 struct vfs dc_vfs; 131 132 struct kmem_cache *dcnode_cache; 133 struct kmem_cache *dcbuf_cache[DCCACHESIZE]; 134 135 kmutex_t dccache_lock; 136 137 static int dcinit(int, char *); 138 139 static struct dcnode *dcnode_alloc(void); 140 static void dcnode_free(struct dcnode *); 141 static void dcnode_recycle(struct dcnode *); 142 143 static void dcinsert(struct dcnode *); 144 static void dcdelete(struct dcnode *); 145 static struct dcnode *dcfind(struct vnode *); 146 static void dclru_add(struct dcnode *); 147 static void dclru_sub(struct dcnode *); 148 149 150 /* 151 * This is the loadable module wrapper. 152 */ 153 #include <sys/modctl.h> 154 155 struct vfsops *dc_vfsops; 156 157 static vfsdef_t vfw = { 158 VFSDEF_VERSION, 159 "dcfs", 160 dcinit, 161 VSW_ZMOUNT, 162 NULL 163 }; 164 165 /* 166 * Module linkage information for the kernel. 167 */ 168 extern struct mod_ops mod_fsops; 169 170 static struct modlfs modlfs = { 171 &mod_fsops, "compressed filesystem", &vfw 172 }; 173 174 static struct modlinkage modlinkage = { 175 MODREV_1, (void *)&modlfs, NULL 176 }; 177 178 int 179 _init() 180 { 181 return (mod_install(&modlinkage)); 182 } 183 184 int 185 _info(struct modinfo *modinfop) 186 { 187 return (mod_info(&modlinkage, modinfop)); 188 } 189 190 191 static int dc_open(struct vnode **, int, struct cred *, caller_context_t *); 192 static int dc_close(struct vnode *, int, int, offset_t, 193 struct cred *, caller_context_t *); 194 static int dc_read(struct vnode *, struct uio *, int, struct cred *, 195 struct caller_context *); 196 static int dc_getattr(struct vnode *, struct vattr *, int, 197 struct cred *, caller_context_t *); 198 static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *, 199 struct caller_context *); 200 static int dc_access(struct vnode *, int, int, 201 struct cred *, caller_context_t *); 202 static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *); 203 static void dc_inactive(struct vnode *, struct cred *, caller_context_t *); 204 static int dc_fid(struct vnode *, struct fid *, caller_context_t *); 205 static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *); 206 static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t, 207 struct flk_callback *, struct cred *, caller_context_t *); 208 static int dc_realvp(struct vnode *, struct vnode **, caller_context_t *); 209 static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *, 210 struct page **, size_t, struct seg *, caddr_t, enum seg_rw, 211 struct cred *, caller_context_t *); 212 static int dc_putpage(struct vnode *, offset_t, size_t, int, 213 struct cred *, caller_context_t *); 214 static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t, 215 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *); 216 static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t, 217 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *); 218 static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t, 219 uint_t, uint_t, uint_t, struct cred *, caller_context_t *); 220 221 struct vnodeops *dc_vnodeops; 222 223 const fs_operation_def_t dc_vnodeops_template[] = { 224 VOPNAME_OPEN, { .vop_open = dc_open }, 225 VOPNAME_CLOSE, { .vop_close = dc_close }, 226 VOPNAME_READ, { .vop_read = dc_read }, 227 VOPNAME_GETATTR, { .vop_getattr = dc_getattr }, 228 VOPNAME_SETATTR, { .vop_setattr = dc_setattr }, 229 VOPNAME_ACCESS, { .vop_access = dc_access }, 230 VOPNAME_FSYNC, { .vop_fsync = dc_fsync }, 231 VOPNAME_INACTIVE, { .vop_inactive = dc_inactive }, 232 VOPNAME_FID, { .vop_fid = dc_fid }, 233 VOPNAME_SEEK, { .vop_seek = dc_seek }, 234 VOPNAME_FRLOCK, { .vop_frlock = dc_frlock }, 235 VOPNAME_REALVP, { .vop_realvp = dc_realvp }, 236 VOPNAME_GETPAGE, { .vop_getpage = dc_getpage }, 237 VOPNAME_PUTPAGE, { .vop_putpage = dc_putpage }, 238 VOPNAME_MAP, { .vop_map = dc_map }, 239 VOPNAME_ADDMAP, { .vop_addmap = dc_addmap }, 240 VOPNAME_DELMAP, { .vop_delmap = dc_delmap }, 241 NULL, NULL 242 }; 243 244 /*ARGSUSED*/ 245 static int 246 dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp) 247 { 248 return (0); 249 } 250 251 /*ARGSUSED*/ 252 static int 253 dc_close(struct vnode *vp, int flag, int count, offset_t off, 254 struct cred *cr, caller_context_t *ctp) 255 { 256 (void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 257 cleanshares(vp, ttoproc(curthread)->p_pid); 258 return (0); 259 } 260 261 /*ARGSUSED*/ 262 static int 263 dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr, 264 struct caller_context *ct) 265 { 266 struct dcnode *dp = VTODC(vp); 267 size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize); 268 size_t fsize = dp->dc_hdr->ch_fsize; 269 int error; 270 271 /* 272 * Loop through file with segmap, decompression will occur 273 * in dc_getapage 274 */ 275 do { 276 caddr_t base; 277 size_t n; 278 offset_t mapon; 279 280 /* 281 * read to end of block or file 282 */ 283 mapon = uiop->uio_loffset & (rdsize - 1); 284 n = MIN(rdsize - mapon, uiop->uio_resid); 285 n = MIN(n, fsize - uiop->uio_loffset); 286 if (n == 0) 287 return (0); /* at EOF */ 288 289 base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1, 290 S_READ); 291 error = uiomove(base + mapon, n, UIO_READ, uiop); 292 if (!error) { 293 uint_t flags; 294 295 if (n + mapon == rdsize || uiop->uio_loffset == fsize) 296 flags = SM_DONTNEED; 297 else 298 flags = 0; 299 error = segmap_release(segkmap, base, flags); 300 } else 301 (void) segmap_release(segkmap, base, 0); 302 } while (!error && uiop->uio_resid); 303 304 return (error); 305 } 306 307 static int 308 dc_getattr(struct vnode *vp, struct vattr *vap, int flags, 309 cred_t *cred, caller_context_t *ctp) 310 { 311 struct dcnode *dp = VTODC(vp); 312 struct vnode *subvp = dp->dc_subvp; 313 int error; 314 315 error = VOP_GETATTR(subvp, vap, flags, cred, ctp); 316 317 /* substitute uncompressed size */ 318 vap->va_size = dp->dc_hdr->ch_fsize; 319 return (error); 320 } 321 322 static int 323 dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred, 324 caller_context_t *ctp) 325 { 326 struct dcnode *dp = VTODC(vp); 327 struct vnode *subvp = dp->dc_subvp; 328 329 return (VOP_SETATTR(subvp, vap, flags, cred, ctp)); 330 } 331 332 static int 333 dc_access(struct vnode *vp, int mode, int flags, 334 cred_t *cred, caller_context_t *ctp) 335 { 336 struct dcnode *dp = VTODC(vp); 337 struct vnode *subvp = dp->dc_subvp; 338 339 return (VOP_ACCESS(subvp, mode, flags, cred, ctp)); 340 } 341 342 /*ARGSUSED*/ 343 static int 344 dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp) 345 { 346 return (0); 347 } 348 349 /*ARGSUSED*/ 350 static void 351 dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp) 352 { 353 struct dcnode *dp = VTODC(vp); 354 355 mutex_enter(&dctable_lock); 356 mutex_enter(&vp->v_lock); 357 ASSERT(vp->v_count >= 1); 358 if (--vp->v_count != 0) { 359 /* 360 * Somebody accessed the dcnode before we got a chance to 361 * remove it. They will remove it when they do a vn_rele. 362 */ 363 mutex_exit(&vp->v_lock); 364 mutex_exit(&dctable_lock); 365 return; 366 } 367 mutex_exit(&vp->v_lock); 368 369 dcnode_free(dp); 370 371 mutex_exit(&dctable_lock); 372 } 373 374 static int 375 dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp) 376 { 377 struct dcnode *dp = VTODC(vp); 378 struct vnode *subvp = dp->dc_subvp; 379 380 return (VOP_FID(subvp, fidp, ctp)); 381 } 382 383 static int 384 dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp) 385 { 386 struct dcnode *dp = VTODC(vp); 387 struct vnode *subvp = dp->dc_subvp; 388 389 return (VOP_SEEK(subvp, oof, noffp, ctp)); 390 } 391 392 static int 393 dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, 394 offset_t offset, struct flk_callback *flk_cbp, 395 cred_t *cr, caller_context_t *ctp) 396 { 397 struct dcnode *dp = VTODC(vp); 398 399 /* 400 * If file is being mapped, disallow frlock. 401 */ 402 if (dp->dc_mapcnt > 0) 403 return (EAGAIN); 404 405 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp)); 406 } 407 408 /*ARGSUSED*/ 409 static int 410 dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp, 411 struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr) 412 { 413 struct dcnode *dp = VTODC(vp); 414 struct comphdr *hdr = dp->dc_hdr; 415 struct page *pp; 416 struct buf *bp; 417 caddr_t saddr; 418 off_t cblkno; 419 size_t rdoff, rdsize, dsize; 420 long xlen; 421 int error, zerr; 422 423 ASSERT(len == hdr->ch_blksize); 424 /* 425 * Get destination pages and make them addressable 426 */ 427 pp = page_create_va(vp, off, len, PG_WAIT, seg, addr); 428 bp = pageio_setup(pp, len, vp, B_READ); 429 bp_mapin(bp); 430 431 /* 432 * read compressed data from subordinate vnode 433 */ 434 saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP); 435 cblkno = off / len; 436 rdoff = hdr->ch_blkmap[cblkno]; 437 rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff; 438 error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff, 439 UIO_SYSSPACE, 0, 0, cr, NULL); 440 if (error) 441 goto cleanup; 442 443 /* 444 * Uncompress 445 */ 446 dsize = len; 447 zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax); 448 if (zerr != Z_OK) { 449 error = EIO; 450 goto cleanup; 451 } 452 453 /* 454 * Handle EOF 455 */ 456 xlen = hdr->ch_fsize - off; 457 if (xlen < len) { 458 bzero(bp->b_un.b_addr + xlen, len - xlen); 459 if (dsize != xlen) 460 error = EIO; 461 } else if (dsize != len) 462 error = EIO; 463 464 /* 465 * Clean up 466 */ 467 cleanup: 468 kmem_cache_free(dp->dc_bufcache, saddr); 469 pageio_done(bp); 470 *ppp = pp; 471 return (error); 472 } 473 474 static int 475 dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp, 476 struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr) 477 { 478 struct page *pp, *plist = NULL; 479 offset_t pgoff; 480 int rdblk; 481 482 /* 483 * pvn_read_kluster() doesn't quite do what we want, since it 484 * thinks sub block reads are ok. Here we always decompress 485 * a full block. 486 */ 487 488 /* 489 * Check page cache 490 */ 491 rdblk = 0; 492 for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) { 493 pp = page_lookup(vp, pgoff, SE_EXCL); 494 if (pp == NULL) { 495 rdblk = 1; 496 break; 497 } 498 page_io_lock(pp); 499 page_add(&plist, pp); 500 plist = plist->p_next; 501 } 502 if (!rdblk) { 503 *ppp = plist; 504 return (0); /* all pages in cache */ 505 } 506 507 /* 508 * Undo any locks so getblock_miss has an open field 509 */ 510 if (plist != NULL) 511 pvn_io_done(plist); 512 513 return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr)); 514 } 515 516 static int 517 dc_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) 518 { 519 struct vnode *rvp; 520 521 vp = VTODC(vp)->dc_subvp; 522 if (VOP_REALVP(vp, &rvp, ct) == 0) 523 vp = rvp; 524 *vpp = vp; 525 return (0); 526 } 527 528 /*ARGSUSED10*/ 529 static int 530 dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp, 531 struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr, 532 enum seg_rw rw, struct cred *cr, caller_context_t *ctp) 533 { 534 struct dcnode *dp = VTODC(vp); 535 struct comphdr *hdr = dp->dc_hdr; 536 struct page *pp, *plist = NULL; 537 caddr_t vp_baddr; 538 offset_t vp_boff, vp_bend; 539 size_t bsize = hdr->ch_blksize; 540 int nblks, error; 541 542 /* does not support write */ 543 if (rw == S_WRITE) { 544 panic("write attempt on compressed file"); 545 /*NOTREACHED*/ 546 } 547 548 if (protp) 549 *protp = PROT_ALL; 550 /* 551 * We don't support asynchronous operation at the moment, so 552 * just pretend we did it. If the pages are ever actually 553 * needed, they'll get brought in then. 554 */ 555 if (pl == NULL) 556 return (0); 557 558 /* 559 * Calc block start and end offsets 560 */ 561 vp_boff = rounddown(off, bsize); 562 vp_bend = roundup(off + len, bsize); 563 vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize); 564 565 nblks = (vp_bend - vp_boff) / bsize; 566 while (nblks--) { 567 error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr, 568 rw, cr); 569 page_list_concat(&plist, &pp); 570 vp_boff += bsize; 571 vp_baddr += bsize; 572 } 573 if (!error) 574 pvn_plist_init(plist, pl, plsz, off, len, rw); 575 else 576 pvn_read_done(plist, B_ERROR); 577 return (error); 578 } 579 580 /* 581 * This function should never be called. We need to have it to pass 582 * it as an argument to other functions. 583 */ 584 /*ARGSUSED*/ 585 static int 586 dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp, 587 int flags, struct cred *cr) 588 { 589 /* should never happen */ 590 cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page"); 591 /*NOTREACHED*/ 592 return (0); 593 } 594 595 596 /* 597 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED. 598 * B_INVAL is set by: 599 * 600 * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag. 601 * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice 602 * which translates to an MC_SYNC with the MS_INVALIDATE flag. 603 * 604 * The B_FREE (as well as the B_DONTNEED) flag is set when the 605 * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked 606 * from SEGVN to release pages behind a pagefault. 607 */ 608 /*ARGSUSED5*/ 609 static int 610 dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags, 611 struct cred *cr, caller_context_t *ctp) 612 { 613 int error = 0; 614 615 if (vp->v_count == 0) { 616 panic("dcfs_putpage: bad v_count"); 617 /*NOTREACHED*/ 618 } 619 620 if (vp->v_flag & VNOMAP) 621 return (ENOSYS); 622 623 if (!vn_has_cached_data(vp)) /* no pages mapped */ 624 return (0); 625 626 if (len == 0) /* from 'off' to EOF */ 627 error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr); 628 else { 629 offset_t io_off; 630 se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED; 631 632 for (io_off = off; io_off < off + len; io_off += PAGESIZE) { 633 page_t *pp; 634 635 /* 636 * We insist on getting the page only if we are 637 * about to invalidate, free or write it and 638 * the B_ASYNC flag is not set. 639 */ 640 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) 641 pp = page_lookup(vp, io_off, se); 642 else 643 pp = page_lookup_nowait(vp, io_off, se); 644 645 if (pp == NULL) 646 continue; 647 /* 648 * Normally pvn_getdirty() should return 0, which 649 * impies that it has done the job for us. 650 * The shouldn't-happen scenario is when it returns 1. 651 * This means that the page has been modified and 652 * needs to be put back. 653 * Since we can't write to a dcfs compressed file, 654 * we fake a failed I/O and force pvn_write_done() 655 * to destroy the page. 656 */ 657 if (pvn_getdirty(pp, flags) == 1) { 658 cmn_err(CE_NOTE, "dc_putpage: dirty page"); 659 pvn_write_done(pp, flags | 660 B_ERROR | B_WRITE | B_INVAL | B_FORCE); 661 } 662 } 663 } 664 return (error); 665 } 666 667 static int 668 dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp, 669 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, 670 struct cred *cred, caller_context_t *ctp) 671 { 672 struct vattr vattr; 673 struct segvn_crargs vn_a; 674 int error; 675 676 if (vp->v_flag & VNOMAP) 677 return (ENOSYS); 678 679 if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0) 680 return (ENXIO); 681 682 /* 683 * If file is being locked, disallow mapping. 684 */ 685 if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp)) 686 return (error); 687 if (vn_has_mandatory_locks(vp, vattr.va_mode)) 688 return (EAGAIN); 689 690 as_rangelock(as); 691 692 if ((flags & MAP_FIXED) == 0) { 693 map_addr(addrp, len, off, 1, flags); 694 if (*addrp == NULL) { 695 as_rangeunlock(as); 696 return (ENOMEM); 697 } 698 } else { 699 /* 700 * User specified address - blow away any previous mappings 701 */ 702 (void) as_unmap(as, *addrp, len); 703 } 704 705 vn_a.vp = vp; 706 vn_a.offset = off; 707 vn_a.type = flags & MAP_TYPE; 708 vn_a.prot = prot; 709 vn_a.maxprot = maxprot; 710 vn_a.flags = flags & ~MAP_TYPE; 711 vn_a.cred = cred; 712 vn_a.amp = NULL; 713 vn_a.szc = 0; 714 vn_a.lgrp_mem_policy_flags = 0; 715 716 error = as_map(as, *addrp, len, segvn_create, &vn_a); 717 as_rangeunlock(as); 718 return (error); 719 } 720 721 /*ARGSUSED*/ 722 static int 723 dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr, 724 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, 725 struct cred *cr, caller_context_t *ctp) 726 { 727 struct dcnode *dp; 728 729 if (vp->v_flag & VNOMAP) 730 return (ENOSYS); 731 732 dp = VTODC(vp); 733 mutex_enter(&dp->dc_lock); 734 dp->dc_mapcnt += btopr(len); 735 mutex_exit(&dp->dc_lock); 736 return (0); 737 } 738 739 /*ARGSUSED*/ 740 static int 741 dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr, 742 size_t len, uint_t prot, uint_t maxprot, uint_t flags, 743 struct cred *cr, caller_context_t *ctp) 744 { 745 struct dcnode *dp; 746 747 if (vp->v_flag & VNOMAP) 748 return (ENOSYS); 749 750 dp = VTODC(vp); 751 mutex_enter(&dp->dc_lock); 752 dp->dc_mapcnt -= btopr(len); 753 ASSERT(dp->dc_mapcnt >= 0); 754 mutex_exit(&dp->dc_lock); 755 return (0); 756 } 757 758 /* 759 * Constructor/destructor routines for dcnodes 760 */ 761 /*ARGSUSED1*/ 762 static int 763 dcnode_constructor(void *buf, void *cdrarg, int kmflags) 764 { 765 struct dcnode *dp = buf; 766 struct vnode *vp; 767 768 vp = dp->dc_vp = vn_alloc(kmflags); 769 if (vp == NULL) { 770 return (-1); 771 } 772 vp->v_data = dp; 773 vp->v_type = VREG; 774 vp->v_flag = VNOSWAP; 775 vp->v_vfsp = &dc_vfs; 776 vn_setops(vp, dc_vnodeops); 777 vn_exists(vp); 778 779 mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL); 780 dp->dc_mapcnt = 0; 781 dp->dc_lrunext = dp->dc_lruprev = NULL; 782 dp->dc_hdr = NULL; 783 dp->dc_subvp = NULL; 784 return (0); 785 } 786 787 /*ARGSUSED*/ 788 static void 789 dcnode_destructor(void *buf, void *cdrarg) 790 { 791 struct dcnode *dp = buf; 792 struct vnode *vp = DCTOV(dp); 793 794 mutex_destroy(&dp->dc_lock); 795 796 VERIFY(dp->dc_hdr == NULL); 797 VERIFY(dp->dc_subvp == NULL); 798 vn_invalid(vp); 799 vn_free(vp); 800 } 801 802 static struct dcnode * 803 dcnode_alloc(void) 804 { 805 struct dcnode *dp; 806 807 /* 808 * If the free list is above DCLRUSIZE 809 * re-use one from it 810 */ 811 mutex_enter(&dctable_lock); 812 if (dclru_len < DCLRUSIZE) { 813 mutex_exit(&dctable_lock); 814 dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP); 815 } else { 816 ASSERT(dclru != NULL); 817 dp = dclru; 818 dclru_sub(dp); 819 dcdelete(dp); 820 mutex_exit(&dctable_lock); 821 dcnode_recycle(dp); 822 } 823 return (dp); 824 } 825 826 static void 827 dcnode_free(struct dcnode *dp) 828 { 829 struct vnode *vp = DCTOV(dp); 830 831 ASSERT(MUTEX_HELD(&dctable_lock)); 832 833 /* 834 * If no cached pages, no need to put it on lru 835 */ 836 if (!vn_has_cached_data(vp)) { 837 dcdelete(dp); 838 dcnode_recycle(dp); 839 kmem_cache_free(dcnode_cache, dp); 840 return; 841 } 842 843 /* 844 * Add to lru, if it's over the limit, free from head 845 */ 846 dclru_add(dp); 847 if (dclru_len > DCLRUSIZE) { 848 dp = dclru; 849 dclru_sub(dp); 850 dcdelete(dp); 851 dcnode_recycle(dp); 852 kmem_cache_free(dcnode_cache, dp); 853 } 854 } 855 856 static void 857 dcnode_recycle(struct dcnode *dp) 858 { 859 struct vnode *vp; 860 861 vp = DCTOV(dp); 862 863 VN_RELE(dp->dc_subvp); 864 dp->dc_subvp = NULL; 865 (void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL); 866 kmem_free(dp->dc_hdr, dp->dc_hdrsize); 867 dp->dc_hdr = NULL; 868 dp->dc_hdrsize = dp->dc_zmax = 0; 869 dp->dc_bufcache = NULL; 870 dp->dc_mapcnt = 0; 871 vn_reinit(vp); 872 vp->v_type = VREG; 873 vp->v_flag = VNOSWAP; 874 vp->v_vfsp = &dc_vfs; 875 } 876 877 static int 878 dcinit(int fstype, char *name) 879 { 880 static const fs_operation_def_t dc_vfsops_template[] = { 881 NULL, NULL 882 }; 883 int error; 884 major_t dev; 885 886 error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops); 887 if (error) { 888 cmn_err(CE_WARN, "dcinit: bad vfs ops template"); 889 return (error); 890 } 891 VFS_INIT(&dc_vfs, dc_vfsops, NULL); 892 dc_vfs.vfs_flag = VFS_RDONLY; 893 dc_vfs.vfs_fstype = fstype; 894 if ((dev = getudev()) == (major_t)-1) 895 dev = 0; 896 dcdev = makedevice(dev, 0); 897 dc_vfs.vfs_dev = dcdev; 898 899 error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops); 900 if (error != 0) { 901 (void) vfs_freevfsops_by_type(fstype); 902 cmn_err(CE_WARN, "dcinit: bad vnode ops template"); 903 return (error); 904 } 905 906 mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL); 907 mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL); 908 dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode), 909 0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0); 910 911 return (0); 912 } 913 914 /* 915 * Return shadow vnode with the given vp as its subordinate 916 */ 917 struct vnode * 918 decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp) 919 { 920 struct dcnode *dp, *ndp; 921 struct comphdr thdr, *hdr; 922 struct kmem_cache **cpp; 923 struct vattr vattr; 924 size_t hdrsize, bsize; 925 int error; 926 927 /* 928 * See if we have an existing shadow 929 * If none, we have to manufacture one 930 */ 931 mutex_enter(&dctable_lock); 932 dp = dcfind(vp); 933 mutex_exit(&dctable_lock); 934 if (dp != NULL) 935 return (DCTOV(dp)); 936 937 /* 938 * Make sure it's a valid compressed file 939 */ 940 hdr = &thdr; 941 error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0, 942 UIO_SYSSPACE, 0, 0, cred, NULL); 943 if (error || hdr->ch_magic != CH_MAGIC_ZLIB || 944 hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB || 945 hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE || 946 hdr->ch_blksize > ptob(DCCACHESIZE) || 947 (hdr->ch_blksize & (hdr->ch_blksize - 1)) != 0) 948 return (NULL); 949 950 /* get underlying file size */ 951 if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0) 952 return (NULL); 953 954 /* 955 * Re-read entire header 956 */ 957 hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t); 958 hdr = kmem_alloc(hdrsize, KM_SLEEP); 959 error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE, 960 0, 0, cred, NULL); 961 if (error) { 962 kmem_free(hdr, hdrsize); 963 return (NULL); 964 } 965 966 /* 967 * add extra blkmap entry to make dc_getblock()'s 968 * life easier 969 */ 970 bsize = hdr->ch_blksize; 971 hdr->ch_blkmap[((hdr->ch_fsize-1) / bsize) + 1] = vattr.va_size; 972 973 ndp = dcnode_alloc(); 974 ndp->dc_subvp = vp; 975 VN_HOLD(vp); 976 ndp->dc_hdr = hdr; 977 ndp->dc_hdrsize = hdrsize; 978 979 /* 980 * Allocate kmem cache if none there already 981 */ 982 ndp->dc_zmax = ZMAXBUF(bsize); 983 cpp = &dcbuf_cache[btop(bsize)]; 984 mutex_enter(&dccache_lock); 985 if (*cpp == NULL) 986 *cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL, 987 NULL, NULL, NULL, NULL, 0); 988 mutex_exit(&dccache_lock); 989 ndp->dc_bufcache = *cpp; 990 991 /* 992 * Recheck table in case someone else created shadow 993 * while we were blocked above. 994 */ 995 mutex_enter(&dctable_lock); 996 dp = dcfind(vp); 997 if (dp != NULL) { 998 mutex_exit(&dctable_lock); 999 dcnode_recycle(ndp); 1000 kmem_cache_free(dcnode_cache, ndp); 1001 return (DCTOV(dp)); 1002 } 1003 dcinsert(ndp); 1004 mutex_exit(&dctable_lock); 1005 1006 return (DCTOV(ndp)); 1007 } 1008 1009 1010 /* 1011 * dcnode lookup table 1012 * These routines maintain a table of dcnodes hashed by their 1013 * subordinate vnode so that they can be found if they already 1014 * exist in the vnode cache 1015 */ 1016 1017 /* 1018 * Put a dcnode in the table. 1019 */ 1020 static void 1021 dcinsert(struct dcnode *newdp) 1022 { 1023 int idx = DCHASH(newdp->dc_subvp); 1024 1025 ASSERT(MUTEX_HELD(&dctable_lock)); 1026 newdp->dc_hash = dctable[idx]; 1027 dctable[idx] = newdp; 1028 } 1029 1030 /* 1031 * Remove a dcnode from the hash table. 1032 */ 1033 void 1034 dcdelete(struct dcnode *deldp) 1035 { 1036 int idx = DCHASH(deldp->dc_subvp); 1037 struct dcnode *dp, *prevdp; 1038 1039 ASSERT(MUTEX_HELD(&dctable_lock)); 1040 dp = dctable[idx]; 1041 if (dp == deldp) 1042 dctable[idx] = dp->dc_hash; 1043 else { 1044 for (prevdp = dp, dp = dp->dc_hash; dp != NULL; 1045 prevdp = dp, dp = dp->dc_hash) { 1046 if (dp == deldp) { 1047 prevdp->dc_hash = dp->dc_hash; 1048 break; 1049 } 1050 } 1051 } 1052 ASSERT(dp != NULL); 1053 } 1054 1055 /* 1056 * Find a shadow vnode in the dctable hash list. 1057 */ 1058 static struct dcnode * 1059 dcfind(struct vnode *vp) 1060 { 1061 struct dcnode *dp; 1062 1063 ASSERT(MUTEX_HELD(&dctable_lock)); 1064 for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash) 1065 if (dp->dc_subvp == vp) { 1066 VN_HOLD(DCTOV(dp)); 1067 if (dp->dc_lrunext) 1068 dclru_sub(dp); 1069 return (dp); 1070 } 1071 return (NULL); 1072 } 1073 1074 #ifdef DEBUG 1075 static int 1076 dclru_count(void) 1077 { 1078 struct dcnode *dp; 1079 int i = 0; 1080 1081 if (dclru == NULL) 1082 return (0); 1083 for (dp = dclru; dp->dc_lrunext != dclru; dp = dp->dc_lrunext) 1084 i++; 1085 return (i + 1); 1086 } 1087 #endif 1088 1089 static void 1090 dclru_add(struct dcnode *dp) 1091 { 1092 /* 1093 * Add to dclru as double-link chain 1094 */ 1095 ASSERT(MUTEX_HELD(&dctable_lock)); 1096 if (dclru == NULL) { 1097 dclru = dp; 1098 dp->dc_lruprev = dp->dc_lrunext = dp; 1099 } else { 1100 struct dcnode *last = dclru->dc_lruprev; 1101 1102 dclru->dc_lruprev = dp; 1103 last->dc_lrunext = dp; 1104 dp->dc_lruprev = last; 1105 dp->dc_lrunext = dclru; 1106 } 1107 dclru_len++; 1108 ASSERT(dclru_len == dclru_count()); 1109 } 1110 1111 static void 1112 dclru_sub(struct dcnode *dp) 1113 { 1114 ASSERT(MUTEX_HELD(&dctable_lock)); 1115 dp->dc_lrunext->dc_lruprev = dp->dc_lruprev; 1116 dp->dc_lruprev->dc_lrunext = dp->dc_lrunext; 1117 if (dp == dclru) 1118 dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext; 1119 dp->dc_lrunext = dp->dc_lruprev = NULL; 1120 dclru_len--; 1121 ASSERT(dclru_len == dclru_count()); 1122 } 1123