xref: /titanic_50/usr/src/uts/common/fs/dcfs/dc_vnops.c (revision c51cb4bc539e1650eb5bb4f805cc779bfce99c06)
1 
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 
41 #pragma ident	"%Z%%M%	%I%	%E% SMI"
42 
43 #include <sys/types.h>
44 #include <sys/thread.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/bitmap.h>
49 #include <sys/buf.h>
50 #include <sys/cmn_err.h>
51 #include <sys/conf.h>
52 #include <sys/ddi.h>
53 #include <sys/debug.h>
54 #include <sys/errno.h>
55 #include <sys/time.h>
56 #include <sys/fcntl.h>
57 #include <sys/flock.h>
58 #include <sys/file.h>
59 #include <sys/kmem.h>
60 #include <sys/mman.h>
61 #include <sys/vmsystm.h>
62 #include <sys/open.h>
63 #include <sys/swap.h>
64 #include <sys/sysmacros.h>
65 #include <sys/uio.h>
66 #include <sys/vfs.h>
67 #include <sys/vfs_opreg.h>
68 #include <sys/vnode.h>
69 #include <sys/stat.h>
70 #include <sys/poll.h>
71 #include <sys/zmod.h>
72 #include <sys/fs/decomp.h>
73 
74 #include <vm/hat.h>
75 #include <vm/as.h>
76 #include <vm/page.h>
77 #include <vm/pvn.h>
78 #include <vm/seg_vn.h>
79 #include <vm/seg_kmem.h>
80 #include <vm/seg_map.h>
81 
82 #include <fs/fs_subr.h>
83 
84 /*
85  * dcfs - A filesystem for automatic decompressing of fiocompressed files
86  *
87  * This filesystem is a layered filesystem that sits on top of a normal
88  * persistent filesystem and provides automatic decompression of files
89  * that have been previously compressed and stored on the host file system.
90  * This is a pseudo filesystem in that it does not persist data, rather it
91  * intercepts file lookup requests on the host filesystem and provides
92  * transparent decompression of those files. Currently the only supported
93  * host filesystem is ufs.
94  *
95  * A file is compressed via a userland utility (currently cmd/boot/fiocompress)
96  * and marked by fiocompress as a compressed file via a flag in the on-disk
 * inode (set via the _FIO_COMPRESSED ufs ioctl() - see ufs_ioctl() in
 * ufs_vnops.c). ufs_lookup checks for this flag and, if it is set, passes
 * control to decompvp, a function defined in this (dcfs) filesystem.
 * decompvp uncompresses the file
100  * and returns a dcfs vnode to the VFS layer.
101  *
102  * dcfs is layered on top of ufs and passes requests involving persistence
103  * to the underlying ufs filesystem. The compressed files currently cannot be
104  * written to.
105  */
106 
107 
108 /*
109  * Define data structures within this file.
110  */
111 #define	DCSHFT		5
112 #define	DCTABLESIZE	16
113 
114 #if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
115 #define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
116 #else
117 #define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) % DTABLESIZEC)
118 #endif
119 
120 #define	DCLRUSIZE	16
121 
122 #define	DCCACHESIZE	4
123 
124 #define	rounddown(x, y)	((x) & ~((y) - 1))
125 
126 struct dcnode	*dctable[DCTABLESIZE];
127 
128 struct dcnode	*dclru;
129 static int	dclru_len;
130 
131 kmutex_t	dctable_lock;
132 
133 dev_t		dcdev;
134 struct vfs	dc_vfs;
135 
136 struct kmem_cache *dcnode_cache;
137 struct kmem_cache *dcbuf_cache[DCCACHESIZE];
138 
139 kmutex_t	dccache_lock;
140 
141 static int dcinit(int, char *);
142 
143 static struct dcnode	*dcnode_alloc(void);
144 static void		dcnode_free(struct dcnode *);
145 static void		dcnode_recycle(struct dcnode *);
146 
147 static void		dcinsert(struct dcnode *);
148 static void		dcdelete(struct dcnode *);
149 static struct dcnode	*dcfind(struct vnode *);
150 static void		dclru_add(struct dcnode *);
151 static void		dclru_sub(struct dcnode *);
152 
153 
/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>

struct vfsops *dc_vfsops;

/* Filesystem type registration; dcinit() runs when the fstype is set up */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"dcfs",
	dcinit,
	0,
	NULL
};

/*
 * Module linkage information for the kernel.
 */
extern struct mod_ops mod_fsops;

static struct modlfs modlfs = {
	&mod_fsops, "compressed filesystem", &vfw
};

/*
 * NOTE(review): no _fini() is defined, so the module appears to be
 * non-unloadable by design — confirm before adding one.
 */
static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};
181 
182 int
183 _init()
184 {
185 	return (mod_install(&modlinkage));
186 }
187 
188 int
189 _info(struct modinfo *modinfop)
190 {
191 	return (mod_info(&modlinkage, modinfop));
192 }
193 
194 
/*
 * Vnode operations for dcfs shadow vnodes.  Attribute, lock and id
 * operations are passed through to the subordinate (host fs) vnode;
 * the read and page paths perform the actual decompression.
 */
static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
static int dc_close(struct vnode *, int, int, offset_t,
    struct cred *, caller_context_t *);
static int dc_read(struct vnode *, struct uio *, int, struct cred *,
    struct caller_context *);
static int dc_getattr(struct vnode *, struct vattr *, int,
    struct cred *, caller_context_t *);
static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
    struct caller_context *);
static int dc_access(struct vnode *, int, int,
    struct cred *, caller_context_t *);
static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    struct flk_callback *, struct cred *, caller_context_t *);
static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
    struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
    struct cred *, caller_context_t *);
static int dc_putpage(struct vnode *, offset_t, size_t, int,
    struct cred *, caller_context_t *);
static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uint_t, uint_t, uint_t, struct cred *, caller_context_t *);

struct vnodeops *dc_vnodeops;

/* Operations not listed here fall back to the fs defaults. */
const fs_operation_def_t dc_vnodeops_template[] = {
	VOPNAME_OPEN,			{ .vop_open = dc_open },
	VOPNAME_CLOSE,			{ .vop_close = dc_close },
	VOPNAME_READ,			{ .vop_read = dc_read },
	VOPNAME_GETATTR,		{ .vop_getattr =  dc_getattr },
	VOPNAME_SETATTR,		{ .vop_setattr = dc_setattr },
	VOPNAME_ACCESS,			{ .vop_access = dc_access },
	VOPNAME_FSYNC,			{ .vop_fsync = dc_fsync },
	VOPNAME_INACTIVE,		{ .vop_inactive = dc_inactive },
	VOPNAME_FID,			{ .vop_fid = dc_fid },
	VOPNAME_SEEK,			{ .vop_seek = dc_seek },
	VOPNAME_FRLOCK,			{ .vop_frlock = dc_frlock },
	VOPNAME_GETPAGE,		{ .vop_getpage = dc_getpage },
	VOPNAME_PUTPAGE,		{ .vop_putpage = dc_putpage },
	VOPNAME_MAP,			{ .vop_map = dc_map },
	VOPNAME_ADDMAP,			{ .vop_addmap = dc_addmap },
	VOPNAME_DELMAP,			{ .vop_delmap = dc_delmap },
	NULL,				NULL
};
245 
246 /*ARGSUSED*/
247 static int
248 dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
249 {
250 	return (0);
251 }
252 
253 /*ARGSUSED*/
254 static int
255 dc_close(struct vnode *vp, int flag, int count, offset_t off,
256     struct cred *cr, caller_context_t *ctp)
257 {
258 	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
259 	cleanshares(vp, ttoproc(curthread)->p_pid);
260 	return (0);
261 }
262 
263 /*ARGSUSED*/
264 static int
265 dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
266 	struct caller_context *ct)
267 {
268 	struct dcnode *dp = VTODC(vp);
269 	size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
270 	size_t fsize = dp->dc_hdr->ch_fsize;
271 	int error;
272 
273 	/*
274 	 * Loop through file with segmap, decompression will occur
275 	 * in dc_getapage
276 	 */
277 	do {
278 		caddr_t base;
279 		size_t n;
280 		offset_t mapon;
281 
282 		/*
283 		 * read to end of block or file
284 		 */
285 		mapon = uiop->uio_loffset & (rdsize - 1);
286 		n = MIN(rdsize - mapon, uiop->uio_resid);
287 		n = MIN(n, fsize - uiop->uio_loffset);
288 		if (n == 0)
289 			return (0);	/* at EOF */
290 
291 		base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
292 		    S_READ);
293 		error = uiomove(base + mapon, n, UIO_READ, uiop);
294 		if (!error) {
295 			uint_t flags;
296 
297 			if (n + mapon == rdsize || uiop->uio_loffset == fsize)
298 				flags = SM_DONTNEED;
299 			else
300 				flags = 0;
301 			error = segmap_release(segkmap, base, flags);
302 		} else
303 			(void) segmap_release(segkmap, base, 0);
304 	} while (!error && uiop->uio_resid);
305 
306 	return (error);
307 }
308 
309 static int
310 dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
311     cred_t *cred, caller_context_t *ctp)
312 {
313 	struct dcnode *dp = VTODC(vp);
314 	struct vnode *subvp = dp->dc_subvp;
315 	int error;
316 
317 	error = VOP_GETATTR(subvp, vap, flags, cred, ctp);
318 
319 	/* substitute uncompressed size */
320 	vap->va_size = dp->dc_hdr->ch_fsize;
321 	return (error);
322 }
323 
324 static int
325 dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
326     caller_context_t *ctp)
327 {
328 	struct dcnode *dp = VTODC(vp);
329 	struct vnode *subvp = dp->dc_subvp;
330 
331 	return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
332 }
333 
334 static int
335 dc_access(struct vnode *vp, int mode, int flags,
336     cred_t *cred, caller_context_t *ctp)
337 {
338 	struct dcnode *dp = VTODC(vp);
339 	struct vnode *subvp = dp->dc_subvp;
340 
341 	return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
342 }
343 
344 /*ARGSUSED*/
345 static int
346 dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
347 {
348 	return (0);
349 }
350 
/*
 * Last-reference handling for a shadow vnode.  dctable_lock is taken
 * before v_lock so that dropping the final hold and removing the node
 * from the table appear atomic with respect to dcfind().
 */
/*ARGSUSED*/
static void
dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
{
	struct dcnode *dp = VTODC(vp);

	mutex_enter(&dctable_lock);
	mutex_enter(&vp->v_lock);
	ASSERT(vp->v_count >= 1);
	if (--vp->v_count != 0) {
		/*
		 * Somebody accessed the dcnode before we got a chance to
		 * remove it.  They will remove it when they do a vn_rele.
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&dctable_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/* Truly the last hold: free or LRU-cache the dcnode. */
	dcnode_free(dp);

	mutex_exit(&dctable_lock);
}
375 
376 static int
377 dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
378 {
379 	struct dcnode *dp = VTODC(vp);
380 	struct vnode *subvp = dp->dc_subvp;
381 
382 	return (VOP_FID(subvp, fidp, ctp));
383 }
384 
385 static int
386 dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
387 {
388 	struct dcnode *dp = VTODC(vp);
389 	struct vnode *subvp = dp->dc_subvp;
390 
391 	return (VOP_SEEK(subvp, oof, noffp, ctp));
392 }
393 
394 static int
395 dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
396     offset_t offset, struct flk_callback *flk_cbp,
397     cred_t *cr, caller_context_t *ctp)
398 {
399 	struct dcnode *dp = VTODC(vp);
400 
401 	/*
402 	 * If file is being mapped, disallow frlock.
403 	 */
404 	if (dp->dc_mapcnt > 0)
405 		return (EAGAIN);
406 
407 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
408 }
409 
/*
 * Page-cache miss path: read one compressed block from the subordinate
 * vnode, decompress it into freshly created pages, and hand the (still
 * locked) pages back to the caller.  The pages are returned via *ppp
 * even on error so the caller can release them.
 */
/*ARGSUSED*/
static int
dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct dcnode *dp = VTODC(vp);
	struct comphdr *hdr = dp->dc_hdr;
	struct page *pp;
	struct buf *bp;
	caddr_t saddr;
	off_t cblkno;
	size_t rdoff, rdsize, dsize;
	long xlen;
	int error, zerr;

	/* callers always request exactly one compressed block */
	ASSERT(len == hdr->ch_blksize);
	/*
	 * Get destination pages and make them addressable
	 */
	pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
	bp = pageio_setup(pp, len, vp, B_READ);
	bp_mapin(bp);

	/*
	 * read compressed data from subordinate vnode
	 * (ch_blkmap[] holds per-block file offsets; the sentinel entry
	 * appended in decompvp() bounds the size of the last block)
	 */
	saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
	cblkno = off / len;
	rdoff = hdr->ch_blkmap[cblkno];
	rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
	error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
	    UIO_SYSSPACE, 0, 0, cr, NULL);
	if (error)
		goto cleanup;

	/*
	 * Uncompress
	 */
	dsize = len;
	zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
	if (zerr != Z_OK) {
		error = EIO;
		goto cleanup;
	}

	/*
	 * Handle EOF: zero the tail of a short last block, and verify
	 * the decompressed size matches what the header promised.
	 * NOTE(review): xlen is signed long while len is size_t, so the
	 * comparison promotes xlen to unsigned; this relies on callers
	 * only requesting blocks with off < ch_fsize — confirm.
	 */
	xlen = hdr->ch_fsize - off;
	if (xlen < len) {
		bzero(bp->b_un.b_addr + xlen, len - xlen);
		if (dsize != xlen)
			error = EIO;
	} else if (dsize != len)
		error = EIO;

	/*
	 * Clean up
	 */
cleanup:
	kmem_cache_free(dp->dc_bufcache, saddr);
	pageio_done(bp);
	*ppp = pp;
	return (error);
}
475 
/*
 * Return the pages covering one full compressed block, either straight
 * from the page cache or (on any miss) by decompressing the whole
 * block via dc_getblock_miss().
 */
static int
dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
{
	struct page *pp, *plist = NULL;
	offset_t pgoff;
	int rdblk;

	/*
	 * pvn_read_kluster() doesn't quite do what we want, since it
	 * thinks sub block reads are ok.  Here we always decompress
	 * a full block.
	 */

	/*
	 * Check page cache
	 */
	rdblk = 0;
	for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
		pp = page_lookup(vp, pgoff, SE_EXCL);
		if (pp == NULL) {
			rdblk = 1;	/* missing page: must decompress */
			break;
		}
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;	/* keep ascending-offset order */
	}
	if (!rdblk) {
		*ppp = plist;
		return (0);	/* all pages in cache */
	}

	/*
	 * Undo any locks so getblock_miss has an open field
	 */
	if (plist != NULL)
		pvn_io_done(plist);

	return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
}
517 
518 /*ARGSUSED10*/
519 static int
520 dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
521     struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
522     enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
523 {
524 	struct dcnode *dp = VTODC(vp);
525 	struct comphdr *hdr = dp->dc_hdr;
526 	struct page *pp, *plist = NULL;
527 	caddr_t vp_baddr;
528 	offset_t vp_boff, vp_bend;
529 	size_t bsize = hdr->ch_blksize;
530 	int nblks, error;
531 
532 	/* does not support write */
533 	if (rw == S_WRITE) {
534 		panic("write attempt on compressed file");
535 		/*NOTREACHED*/
536 	}
537 
538 	if (protp)
539 		*protp = PROT_ALL;
540 	/*
541 	 * We don't support asynchronous operation at the moment, so
542 	 * just pretend we did it.  If the pages are ever actually
543 	 * needed, they'll get brought in then.
544 	 */
545 	if (pl == NULL)
546 		return (0);
547 
548 	/*
549 	 * Calc block start and end offsets
550 	 */
551 	vp_boff = rounddown(off, bsize);
552 	vp_bend = roundup(off + len, bsize);
553 	vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);
554 
555 	nblks = (vp_bend - vp_boff) / bsize;
556 	while (nblks--) {
557 		error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
558 		    rw, cr);
559 		page_list_concat(&plist, &pp);
560 		vp_boff += bsize;
561 		vp_baddr += bsize;
562 	}
563 	if (!error)
564 		pvn_plist_init(plist, pl, plsz, off, len, rw);
565 	else
566 		pvn_read_done(plist, B_ERROR);
567 	return (error);
568 }
569 
/*
 * This function should never be called. We need to have it to pass
 * it as an argument to other functions (pvn_vplist_dirty callers).
 * dcfs files are read-only, so a dirty page here means corruption.
 */
/*ARGSUSED*/
static int
dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
    int flags, struct cred *cr)
{
	/* should never happen */
	cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
	/*NOTREACHED*/
	return (0);
}
584 
585 
/*
 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
 * B_INVAL is set by:
 *
 *	1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
 *	2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
 *	   which translates to an MC_SYNC with the MS_INVALIDATE flag.
 *
 * The B_FREE (as well as the B_DONTNEED) flag is set when the
 * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
 * from SEGVN to release pages behind a pagefault.
 */
/*ARGSUSED5*/
static int
dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
    struct cred *cr, caller_context_t *ctp)
{
	int error = 0;

	if (vp->v_count == 0) {
		panic("dcfs_putpage: bad v_count");
		/*NOTREACHED*/
	}

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (!vn_has_cached_data(vp))	/* no pages mapped */
		return (0);

	if (len == 0)		/* from 'off' to EOF */
		error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
	else {
		offset_t io_off;
		se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
			page_t *pp;

			/*
			 * We insist on getting the page only if we are
			 * about to invalidate, free or write it and
			 * the B_ASYNC flag is not set.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
				pp = page_lookup(vp, io_off, se);
			else
				pp = page_lookup_nowait(vp, io_off, se);

			if (pp == NULL)
				continue;
			/*
			 * Normally pvn_getdirty() should return 0, which
			 * implies that it has done the job for us.
			 * The shouldn't-happen scenario is when it returns 1.
			 * This means that the page has been modified and
			 * needs to be put back.
			 * Since we can't write to a dcfs compressed file,
			 * we fake a failed I/O and force pvn_write_done()
			 * to destroy the page.
			 */
			if (pvn_getdirty(pp, flags) == 1) {
				cmn_err(CE_NOTE, "dc_putpage: dirty page");
				pvn_write_done(pp, flags |
				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
			}
		}
	}
	return (error);
}
656 
/*
 * Establish a user mapping of the (uncompressed) file via segvn.
 * Mapping is refused while the underlying file holds mandatory locks,
 * mirroring the frlock/mmap exclusion enforced in dc_frlock().
 */
static int
dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ctp)
{
	struct vattr vattr;
	struct segvn_crargs vn_a;
	int error;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/* reject negative offsets and offset+len overflow */
	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
		return (ENXIO);

	/*
	 * If file is being locked, disallow mapping.
	 */
	if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
		return (error);
	if (vn_has_mandatory_locks(vp, vattr.va_mode))
		return (EAGAIN);

	as_rangelock(as);

	if ((flags & MAP_FIXED) == 0) {
		/* let the system pick an address; fails when as is full */
		map_addr(addrp, len, off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
	} else {
		/*
		 * User specified address - blow away any previous mappings
		 */
		(void) as_unmap(as, *addrp, len);
	}

	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.cred = cred;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);
	return (error);
}
710 
711 /*ARGSUSED*/
712 static int
713 dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
714     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
715     struct cred *cr, caller_context_t *ctp)
716 {
717 	struct dcnode *dp;
718 
719 	if (vp->v_flag & VNOMAP)
720 		return (ENOSYS);
721 
722 	dp = VTODC(vp);
723 	mutex_enter(&dp->dc_lock);
724 	dp->dc_mapcnt += btopr(len);
725 	mutex_exit(&dp->dc_lock);
726 	return (0);
727 }
728 
729 /*ARGSUSED*/
730 static int
731 dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
732     size_t len, uint_t prot, uint_t maxprot, uint_t flags,
733     struct cred *cr, caller_context_t *ctp)
734 {
735 	struct dcnode *dp;
736 
737 	if (vp->v_flag & VNOMAP)
738 		return (ENOSYS);
739 
740 	dp = VTODC(vp);
741 	mutex_enter(&dp->dc_lock);
742 	dp->dc_mapcnt -= btopr(len);
743 	ASSERT(dp->dc_mapcnt >= 0);
744 	mutex_exit(&dp->dc_lock);
745 	return (0);
746 }
747 
/*
 * Constructor/destructor routines for dcnodes
 */
/*ARGSUSED1*/
static int
dcnode_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct dcnode *dp = buf;
	struct vnode *vp;

	/* Each dcnode permanently owns one shadow vnode. */
	vp = dp->dc_vp = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);	/* tells kmem the construction failed */
	}
	vp->v_data = dp;
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
	vn_setops(vp, dc_vnodeops);
	vn_exists(vp);

	mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
	dp->dc_mapcnt = 0;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dp->dc_hdr = NULL;
	dp->dc_subvp = NULL;
	return (0);
}
776 
777 /*ARGSUSED*/
778 static void
779 dcnode_destructor(void *buf, void *cdrarg)
780 {
781 	struct dcnode *dp = buf;
782 	struct vnode *vp = DCTOV(dp);
783 
784 	mutex_destroy(&dp->dc_lock);
785 
786 	VERIFY(dp->dc_hdr == NULL);
787 	VERIFY(dp->dc_subvp == NULL);
788 	vn_invalid(vp);
789 	vn_free(vp);
790 }
791 
/*
 * Allocate a dcnode, preferring to recycle one off the LRU cache once
 * the cache has grown to DCLRUSIZE entries.
 */
static struct dcnode *
dcnode_alloc(void)
{
	struct dcnode *dp;

	/*
	 * If the free list is above DCLRUSIZE
	 * re-use one from it
	 */
	mutex_enter(&dctable_lock);
	if (dclru_len < DCLRUSIZE) {
		mutex_exit(&dctable_lock);
		dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
	} else {
		ASSERT(dclru != NULL);
		dp = dclru;		/* least-recently-used node */
		dclru_sub(dp);
		dcdelete(dp);		/* LRU nodes stay hashed: unhash it */
		mutex_exit(&dctable_lock);
		dcnode_recycle(dp);	/* drop old subvp/header/pages */
	}
	return (dp);
}
815 
/*
 * Release a dcnode whose last vnode hold was dropped.  Nodes with
 * cached pages are parked on the LRU (still hashed, so dcfind() can
 * revive them); others are recycled and freed immediately.
 */
static void
dcnode_free(struct dcnode *dp)
{
	struct vnode *vp = DCTOV(dp);

	ASSERT(MUTEX_HELD(&dctable_lock));

	/*
	 * If no cached pages, no need to put it on lru
	 */
	if (!vn_has_cached_data(vp)) {
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
		return;
	}

	/*
	 * Add to lru, if it's over the limit, free from head
	 * (the evicted node may well be a different one than dp)
	 */
	dclru_add(dp);
	if (dclru_len > DCLRUSIZE) {
		dp = dclru;
		dclru_sub(dp);
		dcdelete(dp);
		dcnode_recycle(dp);
		kmem_cache_free(dcnode_cache, dp);
	}
}
845 
/*
 * Strip a dcnode back to its constructed state so it can be reused:
 * drop the subordinate vnode, destroy cached pages, free the header,
 * and reinitialize the shadow vnode.
 */
static void
dcnode_recycle(struct dcnode *dp)
{
	struct vnode *vp;

	vp = DCTOV(dp);

	/* drop the hold on the subordinate vnode taken in decompvp() */
	VN_RELE(dp->dc_subvp);
	dp->dc_subvp = NULL;
	/* B_INVAL destroys all cached pages; dc_putapage panics if dirty */
	(void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
	kmem_free(dp->dc_hdr, dp->dc_hdrsize);
	dp->dc_hdr = NULL;
	dp->dc_hdrsize = dp->dc_zmax = 0;
	dp->dc_bufcache = NULL;
	dp->dc_mapcnt = 0;
	vn_reinit(vp);
	vp->v_type = VREG;
	vp->v_flag = VNOSWAP;
	vp->v_vfsp = &dc_vfs;
}
866 
/*
 * Filesystem-type init routine, invoked through vfw when the "dcfs"
 * fstype is registered.  Sets up the single read-only vfs, the vnode
 * ops vector, the global locks and the dcnode kmem cache.
 */
static int
dcinit(int fstype, char *name)
{
	/* dcfs needs no real VFS operations; it is never mounted */
	static const fs_operation_def_t dc_vfsops_template[] = {
		NULL, NULL
	};
	int error;
	major_t dev;

	error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
	if (error) {
		cmn_err(CE_WARN, "dcinit: bad vfs ops template");
		return (error);
	}
	VFS_INIT(&dc_vfs, dc_vfsops, NULL);
	dc_vfs.vfs_flag = VFS_RDONLY;	/* compressed files can't be written */
	dc_vfs.vfs_fstype = fstype;
	/* fabricate a device number so stat(2) has something to report */
	if ((dev = getudev()) == (major_t)-1)
		dev = 0;
	dcdev = makedevice(dev, 0);
	dc_vfs.vfs_dev = dcdev;

	error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
	if (error != 0) {
		(void) vfs_freevfsops_by_type(fstype);
		cmn_err(CE_WARN, "dcinit: bad vnode ops template");
		return (error);
	}

	mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
	dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
	    0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);

	return (0);
}
903 
/*
 * Return shadow vnode with the given vp as its subordinate
 * Called by the host filesystem for files flagged as compressed.
 * Returns NULL if vp does not carry a valid compression header,
 * otherwise a held dcfs vnode.
 */
struct vnode *
decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
{
	struct dcnode *dp, *ndp;
	struct comphdr thdr, *hdr;
	struct kmem_cache **cpp;
	struct vattr vattr;
	size_t hdrsize, bsize;
	int error;

	/*
	 * See if we have an existing shadow
	 * If none, we have to manufacture one
	 * (dcfind returns the node already held)
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	mutex_exit(&dctable_lock);
	if (dp != NULL)
		return (DCTOV(dp));

	/*
	 * Make sure it's a valid compressed file: check magic, version,
	 * algorithm, and that the block size is a power of two between
	 * PAGESIZE and DCCACHESIZE pages.
	 */
	hdr = &thdr;
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
	    UIO_SYSSPACE, 0, 0, cred, NULL);
	if (error || hdr->ch_magic != CH_MAGIC ||
	    hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
	    hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
	    hdr->ch_blksize > ptob(DCCACHESIZE) ||
	    (hdr->ch_blksize & (hdr->ch_blksize - 1)) != 0)
		return (NULL);

	/* get underlying file size */
	if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
		return (NULL);

	/*
	 * Re-read entire header
	 * ch_blkmap[0] is the offset of the first data block, i.e. the
	 * end of the header; one extra uint64_t is allocated for the
	 * sentinel blkmap entry stored below.
	 * NOTE(review): hdrsize derives from on-disk data and is not
	 * explicitly range-checked here — confirm an upper bound is
	 * enforced by the compression utility / host fs.
	 */
	hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
	hdr = kmem_alloc(hdrsize, KM_SLEEP);
	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
	    0, 0, cred, NULL);
	if (error) {
		kmem_free(hdr, hdrsize);
		return (NULL);
	}

	/*
	 * add extra blkmap entry to make dc_getblock()'s
	 * life easier
	 */
	bsize = hdr->ch_blksize;
	hdr->ch_blkmap[((hdr->ch_fsize-1) / bsize) + 1] = vattr.va_size;

	ndp = dcnode_alloc();
	ndp->dc_subvp = vp;
	VN_HOLD(vp);	/* released by dcnode_recycle() */
	ndp->dc_hdr = hdr;
	ndp->dc_hdrsize = hdrsize;

	/*
	 * Allocate kmem cache if none there already
	 * (one staging-buffer cache per block size, sized for the
	 * worst-case compressed block)
	 */
	ndp->dc_zmax = ZMAXBUF(bsize);
	cpp = &dcbuf_cache[btop(bsize)];
	mutex_enter(&dccache_lock);
	if (*cpp == NULL)
		*cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
		    NULL, NULL, NULL, NULL, 0);
	mutex_exit(&dccache_lock);
	ndp->dc_bufcache = *cpp;

	/*
	 * Recheck table in case someone else created shadow
	 * while we were blocked above.
	 */
	mutex_enter(&dctable_lock);
	dp = dcfind(vp);
	if (dp != NULL) {
		mutex_exit(&dctable_lock);
		dcnode_recycle(ndp);	/* discard our losing duplicate */
		kmem_cache_free(dcnode_cache, ndp);
		return (DCTOV(dp));
	}
	dcinsert(ndp);
	mutex_exit(&dctable_lock);

	return (DCTOV(ndp));
}
998 
999 
1000 /*
1001  * dcnode lookup table
1002  * These routines maintain a table of dcnodes hashed by their
1003  * subordinate vnode so that they can be found if they already
1004  * exist in the vnode cache
1005  */
1006 
1007 /*
1008  * Put a dcnode in the table.
1009  */
1010 static void
1011 dcinsert(struct dcnode *newdp)
1012 {
1013 	int idx = DCHASH(newdp->dc_subvp);
1014 
1015 	ASSERT(MUTEX_HELD(&dctable_lock));
1016 	newdp->dc_hash = dctable[idx];
1017 	dctable[idx] = newdp;
1018 }
1019 
1020 /*
1021  * Remove a dcnode from the hash table.
1022  */
1023 void
1024 dcdelete(struct dcnode *deldp)
1025 {
1026 	int idx = DCHASH(deldp->dc_subvp);
1027 	struct dcnode *dp, *prevdp;
1028 
1029 	ASSERT(MUTEX_HELD(&dctable_lock));
1030 	dp = dctable[idx];
1031 	if (dp == deldp)
1032 		dctable[idx] = dp->dc_hash;
1033 	else {
1034 		for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
1035 		    prevdp = dp, dp = dp->dc_hash) {
1036 			if (dp == deldp) {
1037 				prevdp->dc_hash = dp->dc_hash;
1038 				break;
1039 			}
1040 		}
1041 	}
1042 	ASSERT(dp != NULL);
1043 }
1044 
/*
 * Find a shadow vnode in the dctable hash list.
 * On success the shadow vnode is returned held; a node parked on the
 * LRU cache (dc_lrunext != NULL only while on dclru) is pulled off it.
 */
static struct dcnode *
dcfind(struct vnode *vp)
{
	struct dcnode *dp;

	ASSERT(MUTEX_HELD(&dctable_lock));
	for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
		if (dp->dc_subvp == vp) {
			VN_HOLD(DCTOV(dp));
			if (dp->dc_lrunext)
				dclru_sub(dp);
			return (dp);
		}
	return (NULL);
}
1063 
#ifdef	DEBUG
/* Count the nodes on the circular LRU list (debug consistency check). */
static int
dclru_count(void)
{
	struct dcnode *dp;
	int n = 0;

	if (dclru == NULL)
		return (0);
	dp = dclru;
	do {
		n++;
		dp = dp->dc_lrunext;
	} while (dp != dclru);
	return (n);
}
#endif
1078 
static void
dclru_add(struct dcnode *dp)
{
	/*
	 * Add to dclru as double-link chain
	 * dclru points at the head (oldest); new nodes go in at the
	 * tail (head's prev), so eviction from the head is LRU order.
	 */
	ASSERT(MUTEX_HELD(&dctable_lock));
	if (dclru == NULL) {
		dclru = dp;
		dp->dc_lruprev = dp->dc_lrunext = dp;
	} else {
		struct dcnode *last = dclru->dc_lruprev;

		dclru->dc_lruprev = dp;
		last->dc_lrunext = dp;
		dp->dc_lruprev = last;
		dp->dc_lrunext = dclru;
	}
	dclru_len++;
	ASSERT(dclru_len == dclru_count());
}
1100 
/*
 * Unlink dp from the circular LRU list.  The link fields are cleared
 * because dc_lrunext == NULL is how dcfind() tells a node is not on
 * the LRU.
 */
static void
dclru_sub(struct dcnode *dp)
{
	ASSERT(MUTEX_HELD(&dctable_lock));
	dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
	dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
	if (dp == dclru)
		dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
	dp->dc_lrunext = dp->dc_lruprev = NULL;
	dclru_len--;
	ASSERT(dclru_len == dclru_count());
}
1113