xref: /titanic_44/usr/src/uts/common/fs/dcfs/dc_vnops.c (revision 3f7d54a6b84904c8f4d8daa4c7b577bede7df8b9)
1 
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 #include <sys/types.h>
41 #include <sys/thread.h>
42 #include <sys/t_lock.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/bitmap.h>
46 #include <sys/buf.h>
47 #include <sys/cmn_err.h>
48 #include <sys/conf.h>
49 #include <sys/ddi.h>
50 #include <sys/debug.h>
51 #include <sys/errno.h>
52 #include <sys/time.h>
53 #include <sys/fcntl.h>
54 #include <sys/flock.h>
55 #include <sys/file.h>
56 #include <sys/kmem.h>
57 #include <sys/mman.h>
58 #include <sys/vmsystm.h>
59 #include <sys/open.h>
60 #include <sys/swap.h>
61 #include <sys/sysmacros.h>
62 #include <sys/uio.h>
63 #include <sys/vfs.h>
64 #include <sys/vfs_opreg.h>
65 #include <sys/vnode.h>
66 #include <sys/stat.h>
67 #include <sys/poll.h>
68 #include <sys/zmod.h>
69 #include <sys/fs/decomp.h>
70 
71 #include <vm/hat.h>
72 #include <vm/as.h>
73 #include <vm/page.h>
74 #include <vm/pvn.h>
75 #include <vm/seg_vn.h>
76 #include <vm/seg_kmem.h>
77 #include <vm/seg_map.h>
78 
79 #include <fs/fs_subr.h>
80 
81 /*
82  * dcfs - A filesystem for automatic decompressing of fiocompressed files
83  *
84  * This filesystem is a layered filesystem that sits on top of a normal
85  * persistent filesystem and provides automatic decompression of files
86  * that have been previously compressed and stored on the host file system.
87  * This is a pseudo filesystem in that it does not persist data, rather it
88  * intercepts file lookup requests on the host filesystem and provides
89  * transparent decompression of those files. Currently the only supported
90  * host filesystem is ufs.
91  *
 * A file is compressed via a userland utility (currently cmd/boot/fiocompress)
 * and marked by fiocompress as a compressed file via a flag in the on-disk
 * inode (set via a ufs ioctl() - see `ufs_vnops.c`ufs_ioctl()`_FIO_COMPRESSED).
 * ufs_lookup checks for this flag and, if it is set, passes control to
 * decompvp, a function defined in this (dcfs) filesystem. decompvp validates
 * the compression header and returns a dcfs vnode to the VFS layer; the file
 * data itself is uncompressed on demand, block by block, in dc_getpage.
98  *
99  * dcfs is layered on top of ufs and passes requests involving persistence
100  * to the underlying ufs filesystem. The compressed files currently cannot be
101  * written to.
102  */
103 
104 
105 /*
106  * Define data structures within this file.
107  */
/*
 * Hash table geometry.  A dcnode is hashed on the address of its
 * subordinate vnode: the low DCSHFT bits of the pointer are shifted away
 * and the remainder is reduced to an index into dctable[].
 */
#define	DCSHFT		5
#define	DCTABLESIZE	16

/*
 * Use a mask when the table size is a power of two, otherwise fall back
 * to a modulus.  (The modulus variant previously named a non-existent
 * macro and would not have compiled for a non power-of-two table size.)
 */
#if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
#define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
#else
#define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) % DCTABLESIZE)
#endif
116 
/* The LRU list is trimmed/recycled once it holds this many dcnodes. */
#define	DCLRUSIZE	16

/* Number of slots in dcbuf_cache[]; also bounds the block size in pages. */
#define	DCCACHESIZE	4

/* Round x down to a multiple of y; y must be a power of two. */
#define	rounddown(x, y)	((x) & ~((y) - 1))

/* Hash table of dcnodes, indexed by DCHASH() of the subordinate vnode. */
struct dcnode	*dctable[DCTABLESIZE];

/* Head of the circular, doubly linked LRU list and its current length. */
struct dcnode	*dclru;
static int	dclru_len;

/* Held while dctable[] or the LRU list is examined or modified. */
kmutex_t	dctable_lock;

/* Device number and vfs shared by all dcfs vnodes (set up in dcinit). */
dev_t		dcdev;
struct vfs	dc_vfs;

/* kmem caches for dcnodes and for compressed-data bounce buffers. */
struct kmem_cache *dcnode_cache;
struct kmem_cache *dcbuf_cache[DCCACHESIZE];

/* Held while a dcbuf_cache[] slot is checked and created. */
kmutex_t	dccache_lock;
137 
/* Filesystem initialization routine referenced from the vfsdef below. */
static int dcinit(int, char *);

/* dcnode allocation, release and reset. */
static struct dcnode	*dcnode_alloc(void);
static void		dcnode_free(struct dcnode *);
static void		dcnode_recycle(struct dcnode *);

/* Hash table and LRU list maintenance; callers hold dctable_lock. */
static void		dcinsert(struct dcnode *);
static void		dcdelete(struct dcnode *);
static struct dcnode	*dcfind(struct vnode *);
static void		dclru_add(struct dcnode *);
static void		dclru_sub(struct dcnode *);
149 
150 
/*
 * This is the loadable module wrapper.
 */
#include <sys/modctl.h>

/* vfs operations vector, filled in by vfs_setfsops() in dcinit(). */
struct vfsops *dc_vfsops;

/* Filesystem definition: names the fs "dcfs" and points at dcinit(). */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"dcfs",
	dcinit,
	0,
	NULL
};

/*
 * Module linkage information for the kernel.
 */
extern struct mod_ops mod_fsops;

static struct modlfs modlfs = {
	&mod_fsops, "compressed filesystem", &vfw
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlfs, NULL
};
178 
179 int
180 _init()
181 {
182 	return (mod_install(&modlinkage));
183 }
184 
185 int
186 _info(struct modinfo *modinfop)
187 {
188 	return (mod_info(&modlinkage, modinfop));
189 }
190 
191 
/*
 * Prototypes for the vnode operations installed through
 * dc_vnodeops_template below.
 */
static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
static int dc_close(struct vnode *, int, int, offset_t,
    struct cred *, caller_context_t *);
static int dc_read(struct vnode *, struct uio *, int, struct cred *,
    struct caller_context *);
static int dc_getattr(struct vnode *, struct vattr *, int,
    struct cred *, caller_context_t *);
static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
    struct caller_context *);
static int dc_access(struct vnode *, int, int,
    struct cred *, caller_context_t *);
static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    struct flk_callback *, struct cred *, caller_context_t *);
static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
    struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
    struct cred *, caller_context_t *);
static int dc_putpage(struct vnode *, offset_t, size_t, int,
    struct cred *, caller_context_t *);
static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
220 
/* Operations vector built from the template below by vn_make_ops(). */
struct vnodeops *dc_vnodeops;

/*
 * Name/handler pairs for the vnode operations dcfs implements itself;
 * the list is terminated by a NULL entry.  No write operation is listed.
 */
const fs_operation_def_t dc_vnodeops_template[] = {
	VOPNAME_OPEN,			{ .vop_open = dc_open },
	VOPNAME_CLOSE,			{ .vop_close = dc_close },
	VOPNAME_READ,			{ .vop_read = dc_read },
	VOPNAME_GETATTR,		{ .vop_getattr =  dc_getattr },
	VOPNAME_SETATTR,		{ .vop_setattr = dc_setattr },
	VOPNAME_ACCESS,			{ .vop_access = dc_access },
	VOPNAME_FSYNC,			{ .vop_fsync = dc_fsync },
	VOPNAME_INACTIVE,		{ .vop_inactive = dc_inactive },
	VOPNAME_FID,			{ .vop_fid = dc_fid },
	VOPNAME_SEEK,			{ .vop_seek = dc_seek },
	VOPNAME_FRLOCK,			{ .vop_frlock = dc_frlock },
	VOPNAME_GETPAGE,		{ .vop_getpage = dc_getpage },
	VOPNAME_PUTPAGE,		{ .vop_putpage = dc_putpage },
	VOPNAME_MAP,			{ .vop_map = dc_map },
	VOPNAME_ADDMAP,			{ .vop_addmap = dc_addmap },
	VOPNAME_DELMAP,			{ .vop_delmap = dc_delmap },
	NULL,				NULL
};
242 
243 /*ARGSUSED*/
244 static int
245 dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
246 {
247 	return (0);
248 }
249 
250 /*ARGSUSED*/
251 static int
252 dc_close(struct vnode *vp, int flag, int count, offset_t off,
253     struct cred *cr, caller_context_t *ctp)
254 {
255 	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
256 	cleanshares(vp, ttoproc(curthread)->p_pid);
257 	return (0);
258 }
259 
260 /*ARGSUSED*/
261 static int
262 dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
263 	struct caller_context *ct)
264 {
265 	struct dcnode *dp = VTODC(vp);
266 	size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
267 	size_t fsize = dp->dc_hdr->ch_fsize;
268 	int error;
269 
270 	/*
271 	 * Loop through file with segmap, decompression will occur
272 	 * in dc_getapage
273 	 */
274 	do {
275 		caddr_t base;
276 		size_t n;
277 		offset_t mapon;
278 
279 		/*
280 		 * read to end of block or file
281 		 */
282 		mapon = uiop->uio_loffset & (rdsize - 1);
283 		n = MIN(rdsize - mapon, uiop->uio_resid);
284 		n = MIN(n, fsize - uiop->uio_loffset);
285 		if (n == 0)
286 			return (0);	/* at EOF */
287 
288 		base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
289 		    S_READ);
290 		error = uiomove(base + mapon, n, UIO_READ, uiop);
291 		if (!error) {
292 			uint_t flags;
293 
294 			if (n + mapon == rdsize || uiop->uio_loffset == fsize)
295 				flags = SM_DONTNEED;
296 			else
297 				flags = 0;
298 			error = segmap_release(segkmap, base, flags);
299 		} else
300 			(void) segmap_release(segkmap, base, 0);
301 	} while (!error && uiop->uio_resid);
302 
303 	return (error);
304 }
305 
306 static int
307 dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
308     cred_t *cred, caller_context_t *ctp)
309 {
310 	struct dcnode *dp = VTODC(vp);
311 	struct vnode *subvp = dp->dc_subvp;
312 	int error;
313 
314 	error = VOP_GETATTR(subvp, vap, flags, cred, ctp);
315 
316 	/* substitute uncompressed size */
317 	vap->va_size = dp->dc_hdr->ch_fsize;
318 	return (error);
319 }
320 
321 static int
322 dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
323     caller_context_t *ctp)
324 {
325 	struct dcnode *dp = VTODC(vp);
326 	struct vnode *subvp = dp->dc_subvp;
327 
328 	return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
329 }
330 
331 static int
332 dc_access(struct vnode *vp, int mode, int flags,
333     cred_t *cred, caller_context_t *ctp)
334 {
335 	struct dcnode *dp = VTODC(vp);
336 	struct vnode *subvp = dp->dc_subvp;
337 
338 	return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
339 }
340 
341 /*ARGSUSED*/
342 static int
343 dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
344 {
345 	return (0);
346 }
347 
348 /*ARGSUSED*/
349 static void
350 dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
351 {
352 	struct dcnode *dp = VTODC(vp);
353 
354 	mutex_enter(&dctable_lock);
355 	mutex_enter(&vp->v_lock);
356 	ASSERT(vp->v_count >= 1);
357 	if (--vp->v_count != 0) {
358 		/*
359 		 * Somebody accessed the dcnode before we got a chance to
360 		 * remove it.  They will remove it when they do a vn_rele.
361 		 */
362 		mutex_exit(&vp->v_lock);
363 		mutex_exit(&dctable_lock);
364 		return;
365 	}
366 	mutex_exit(&vp->v_lock);
367 
368 	dcnode_free(dp);
369 
370 	mutex_exit(&dctable_lock);
371 }
372 
373 static int
374 dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
375 {
376 	struct dcnode *dp = VTODC(vp);
377 	struct vnode *subvp = dp->dc_subvp;
378 
379 	return (VOP_FID(subvp, fidp, ctp));
380 }
381 
382 static int
383 dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
384 {
385 	struct dcnode *dp = VTODC(vp);
386 	struct vnode *subvp = dp->dc_subvp;
387 
388 	return (VOP_SEEK(subvp, oof, noffp, ctp));
389 }
390 
391 static int
392 dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
393     offset_t offset, struct flk_callback *flk_cbp,
394     cred_t *cr, caller_context_t *ctp)
395 {
396 	struct dcnode *dp = VTODC(vp);
397 
398 	/*
399 	 * If file is being mapped, disallow frlock.
400 	 */
401 	if (dp->dc_mapcnt > 0)
402 		return (EAGAIN);
403 
404 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
405 }
406 
407 /*ARGSUSED*/
408 static int
409 dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
410     struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
411 {
412 	struct dcnode *dp = VTODC(vp);
413 	struct comphdr *hdr = dp->dc_hdr;
414 	struct page *pp;
415 	struct buf *bp;
416 	caddr_t saddr;
417 	off_t cblkno;
418 	size_t rdoff, rdsize, dsize;
419 	long xlen;
420 	int error, zerr;
421 
422 	ASSERT(len == hdr->ch_blksize);
423 	/*
424 	 * Get destination pages and make them addressable
425 	 */
426 	pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
427 	bp = pageio_setup(pp, len, vp, B_READ);
428 	bp_mapin(bp);
429 
430 	/*
431 	 * read compressed data from subordinate vnode
432 	 */
433 	saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
434 	cblkno = off / len;
435 	rdoff = hdr->ch_blkmap[cblkno];
436 	rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
437 	error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
438 	    UIO_SYSSPACE, 0, 0, cr, NULL);
439 	if (error)
440 		goto cleanup;
441 
442 	/*
443 	 * Uncompress
444 	 */
445 	dsize = len;
446 	zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
447 	if (zerr != Z_OK) {
448 		error = EIO;
449 		goto cleanup;
450 	}
451 
452 	/*
453 	 * Handle EOF
454 	 */
455 	xlen = hdr->ch_fsize - off;
456 	if (xlen < len) {
457 		bzero(bp->b_un.b_addr + xlen, len - xlen);
458 		if (dsize != xlen)
459 			error = EIO;
460 	} else if (dsize != len)
461 		error = EIO;
462 
463 	/*
464 	 * Clean up
465 	 */
466 cleanup:
467 	kmem_cache_free(dp->dc_bufcache, saddr);
468 	pageio_done(bp);
469 	*ppp = pp;
470 	return (error);
471 }
472 
473 static int
474 dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
475     struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
476 {
477 	struct page *pp, *plist = NULL;
478 	offset_t pgoff;
479 	int rdblk;
480 
481 	/*
482 	 * pvn_read_kluster() doesn't quite do what we want, since it
483 	 * thinks sub block reads are ok.  Here we always decompress
484 	 * a full block.
485 	 */
486 
487 	/*
488 	 * Check page cache
489 	 */
490 	rdblk = 0;
491 	for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
492 		pp = page_lookup(vp, pgoff, SE_EXCL);
493 		if (pp == NULL) {
494 			rdblk = 1;
495 			break;
496 		}
497 		page_io_lock(pp);
498 		page_add(&plist, pp);
499 		plist = plist->p_next;
500 	}
501 	if (!rdblk) {
502 		*ppp = plist;
503 		return (0);	/* all pages in cache */
504 	}
505 
506 	/*
507 	 * Undo any locks so getblock_miss has an open field
508 	 */
509 	if (plist != NULL)
510 		pvn_io_done(plist);
511 
512 	return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
513 }
514 
515 /*ARGSUSED10*/
516 static int
517 dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
518     struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
519     enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
520 {
521 	struct dcnode *dp = VTODC(vp);
522 	struct comphdr *hdr = dp->dc_hdr;
523 	struct page *pp, *plist = NULL;
524 	caddr_t vp_baddr;
525 	offset_t vp_boff, vp_bend;
526 	size_t bsize = hdr->ch_blksize;
527 	int nblks, error;
528 
529 	/* does not support write */
530 	if (rw == S_WRITE) {
531 		panic("write attempt on compressed file");
532 		/*NOTREACHED*/
533 	}
534 
535 	if (protp)
536 		*protp = PROT_ALL;
537 	/*
538 	 * We don't support asynchronous operation at the moment, so
539 	 * just pretend we did it.  If the pages are ever actually
540 	 * needed, they'll get brought in then.
541 	 */
542 	if (pl == NULL)
543 		return (0);
544 
545 	/*
546 	 * Calc block start and end offsets
547 	 */
548 	vp_boff = rounddown(off, bsize);
549 	vp_bend = roundup(off + len, bsize);
550 	vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);
551 
552 	nblks = (vp_bend - vp_boff) / bsize;
553 	while (nblks--) {
554 		error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
555 		    rw, cr);
556 		page_list_concat(&plist, &pp);
557 		vp_boff += bsize;
558 		vp_baddr += bsize;
559 	}
560 	if (!error)
561 		pvn_plist_init(plist, pl, plsz, off, len, rw);
562 	else
563 		pvn_read_done(plist, B_ERROR);
564 	return (error);
565 }
566 
567 /*
568  * This function should never be called. We need to have it to pass
569  * it as an argument to other functions.
570  */
571 /*ARGSUSED*/
572 static int
573 dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
574     int flags, struct cred *cr)
575 {
576 	/* should never happen */
577 	cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
578 	/*NOTREACHED*/
579 	return (0);
580 }
581 
582 
583 /*
584  * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
585  * B_INVAL is set by:
586  *
587  *	1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
588  *	2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
589  *	   which translates to an MC_SYNC with the MS_INVALIDATE flag.
590  *
591  * The B_FREE (as well as the B_DONTNEED) flag is set when the
592  * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
593  * from SEGVN to release pages behind a pagefault.
594  */
595 /*ARGSUSED5*/
596 static int
597 dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
598     struct cred *cr, caller_context_t *ctp)
599 {
600 	int error = 0;
601 
602 	if (vp->v_count == 0) {
603 		panic("dcfs_putpage: bad v_count");
604 		/*NOTREACHED*/
605 	}
606 
607 	if (vp->v_flag & VNOMAP)
608 		return (ENOSYS);
609 
610 	if (!vn_has_cached_data(vp))	/* no pages mapped */
611 		return (0);
612 
613 	if (len == 0)		/* from 'off' to EOF */
614 		error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
615 	else {
616 		offset_t io_off;
617 		se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
618 
619 		for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
620 			page_t *pp;
621 
622 			/*
623 			 * We insist on getting the page only if we are
624 			 * about to invalidate, free or write it and
625 			 * the B_ASYNC flag is not set.
626 			 */
627 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
628 				pp = page_lookup(vp, io_off, se);
629 			else
630 				pp = page_lookup_nowait(vp, io_off, se);
631 
632 			if (pp == NULL)
633 				continue;
634 			/*
635 			 * Normally pvn_getdirty() should return 0, which
636 			 * impies that it has done the job for us.
637 			 * The shouldn't-happen scenario is when it returns 1.
638 			 * This means that the page has been modified and
639 			 * needs to be put back.
640 			 * Since we can't write to a dcfs compressed file,
641 			 * we fake a failed I/O and force pvn_write_done()
642 			 * to destroy the page.
643 			 */
644 			if (pvn_getdirty(pp, flags) == 1) {
645 				cmn_err(CE_NOTE, "dc_putpage: dirty page");
646 				pvn_write_done(pp, flags |
647 				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
648 			}
649 		}
650 	}
651 	return (error);
652 }
653 
654 static int
655 dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
656     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
657     struct cred *cred, caller_context_t *ctp)
658 {
659 	struct vattr vattr;
660 	struct segvn_crargs vn_a;
661 	int error;
662 
663 	if (vp->v_flag & VNOMAP)
664 		return (ENOSYS);
665 
666 	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
667 		return (ENXIO);
668 
669 	/*
670 	 * If file is being locked, disallow mapping.
671 	 */
672 	if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
673 		return (error);
674 	if (vn_has_mandatory_locks(vp, vattr.va_mode))
675 		return (EAGAIN);
676 
677 	as_rangelock(as);
678 
679 	if ((flags & MAP_FIXED) == 0) {
680 		map_addr(addrp, len, off, 1, flags);
681 		if (*addrp == NULL) {
682 			as_rangeunlock(as);
683 			return (ENOMEM);
684 		}
685 	} else {
686 		/*
687 		 * User specified address - blow away any previous mappings
688 		 */
689 		(void) as_unmap(as, *addrp, len);
690 	}
691 
692 	vn_a.vp = vp;
693 	vn_a.offset = off;
694 	vn_a.type = flags & MAP_TYPE;
695 	vn_a.prot = prot;
696 	vn_a.maxprot = maxprot;
697 	vn_a.flags = flags & ~MAP_TYPE;
698 	vn_a.cred = cred;
699 	vn_a.amp = NULL;
700 	vn_a.szc = 0;
701 	vn_a.lgrp_mem_policy_flags = 0;
702 
703 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
704 	as_rangeunlock(as);
705 	return (error);
706 }
707 
708 /*ARGSUSED*/
709 static int
710 dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
711     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
712     struct cred *cr, caller_context_t *ctp)
713 {
714 	struct dcnode *dp;
715 
716 	if (vp->v_flag & VNOMAP)
717 		return (ENOSYS);
718 
719 	dp = VTODC(vp);
720 	mutex_enter(&dp->dc_lock);
721 	dp->dc_mapcnt += btopr(len);
722 	mutex_exit(&dp->dc_lock);
723 	return (0);
724 }
725 
726 /*ARGSUSED*/
727 static int
728 dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
729     size_t len, uint_t prot, uint_t maxprot, uint_t flags,
730     struct cred *cr, caller_context_t *ctp)
731 {
732 	struct dcnode *dp;
733 
734 	if (vp->v_flag & VNOMAP)
735 		return (ENOSYS);
736 
737 	dp = VTODC(vp);
738 	mutex_enter(&dp->dc_lock);
739 	dp->dc_mapcnt -= btopr(len);
740 	ASSERT(dp->dc_mapcnt >= 0);
741 	mutex_exit(&dp->dc_lock);
742 	return (0);
743 }
744 
745 /*
746  * Constructor/destructor routines for dcnodes
747  */
748 /*ARGSUSED1*/
749 static int
750 dcnode_constructor(void *buf, void *cdrarg, int kmflags)
751 {
752 	struct dcnode *dp = buf;
753 	struct vnode *vp;
754 
755 	vp = dp->dc_vp = vn_alloc(kmflags);
756 	if (vp == NULL) {
757 		return (-1);
758 	}
759 	vp->v_data = dp;
760 	vp->v_type = VREG;
761 	vp->v_flag = VNOSWAP;
762 	vp->v_vfsp = &dc_vfs;
763 	vn_setops(vp, dc_vnodeops);
764 	vn_exists(vp);
765 
766 	mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
767 	dp->dc_mapcnt = 0;
768 	dp->dc_lrunext = dp->dc_lruprev = NULL;
769 	dp->dc_hdr = NULL;
770 	dp->dc_subvp = NULL;
771 	return (0);
772 }
773 
774 /*ARGSUSED*/
775 static void
776 dcnode_destructor(void *buf, void *cdrarg)
777 {
778 	struct dcnode *dp = buf;
779 	struct vnode *vp = DCTOV(dp);
780 
781 	mutex_destroy(&dp->dc_lock);
782 
783 	VERIFY(dp->dc_hdr == NULL);
784 	VERIFY(dp->dc_subvp == NULL);
785 	vn_invalid(vp);
786 	vn_free(vp);
787 }
788 
789 static struct dcnode *
790 dcnode_alloc(void)
791 {
792 	struct dcnode *dp;
793 
794 	/*
795 	 * If the free list is above DCLRUSIZE
796 	 * re-use one from it
797 	 */
798 	mutex_enter(&dctable_lock);
799 	if (dclru_len < DCLRUSIZE) {
800 		mutex_exit(&dctable_lock);
801 		dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
802 	} else {
803 		ASSERT(dclru != NULL);
804 		dp = dclru;
805 		dclru_sub(dp);
806 		dcdelete(dp);
807 		mutex_exit(&dctable_lock);
808 		dcnode_recycle(dp);
809 	}
810 	return (dp);
811 }
812 
813 static void
814 dcnode_free(struct dcnode *dp)
815 {
816 	struct vnode *vp = DCTOV(dp);
817 
818 	ASSERT(MUTEX_HELD(&dctable_lock));
819 
820 	/*
821 	 * If no cached pages, no need to put it on lru
822 	 */
823 	if (!vn_has_cached_data(vp)) {
824 		dcdelete(dp);
825 		dcnode_recycle(dp);
826 		kmem_cache_free(dcnode_cache, dp);
827 		return;
828 	}
829 
830 	/*
831 	 * Add to lru, if it's over the limit, free from head
832 	 */
833 	dclru_add(dp);
834 	if (dclru_len > DCLRUSIZE) {
835 		dp = dclru;
836 		dclru_sub(dp);
837 		dcdelete(dp);
838 		dcnode_recycle(dp);
839 		kmem_cache_free(dcnode_cache, dp);
840 	}
841 }
842 
843 static void
844 dcnode_recycle(struct dcnode *dp)
845 {
846 	struct vnode *vp;
847 
848 	vp = DCTOV(dp);
849 
850 	VN_RELE(dp->dc_subvp);
851 	dp->dc_subvp = NULL;
852 	(void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
853 	kmem_free(dp->dc_hdr, dp->dc_hdrsize);
854 	dp->dc_hdr = NULL;
855 	dp->dc_hdrsize = dp->dc_zmax = 0;
856 	dp->dc_bufcache = NULL;
857 	dp->dc_mapcnt = 0;
858 	vn_reinit(vp);
859 	vp->v_type = VREG;
860 	vp->v_flag = VNOSWAP;
861 	vp->v_vfsp = &dc_vfs;
862 }
863 
864 static int
865 dcinit(int fstype, char *name)
866 {
867 	static const fs_operation_def_t dc_vfsops_template[] = {
868 		NULL, NULL
869 	};
870 	int error;
871 	major_t dev;
872 
873 	error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
874 	if (error) {
875 		cmn_err(CE_WARN, "dcinit: bad vfs ops template");
876 		return (error);
877 	}
878 	VFS_INIT(&dc_vfs, dc_vfsops, NULL);
879 	dc_vfs.vfs_flag = VFS_RDONLY;
880 	dc_vfs.vfs_fstype = fstype;
881 	if ((dev = getudev()) == (major_t)-1)
882 		dev = 0;
883 	dcdev = makedevice(dev, 0);
884 	dc_vfs.vfs_dev = dcdev;
885 
886 	error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
887 	if (error != 0) {
888 		(void) vfs_freevfsops_by_type(fstype);
889 		cmn_err(CE_WARN, "dcinit: bad vnode ops template");
890 		return (error);
891 	}
892 
893 	mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
894 	mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
895 	dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
896 	    0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);
897 
898 	return (0);
899 }
900 
901 /*
902  * Return shadow vnode with the given vp as its subordinate
903  */
904 struct vnode *
905 decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
906 {
907 	struct dcnode *dp, *ndp;
908 	struct comphdr thdr, *hdr;
909 	struct kmem_cache **cpp;
910 	struct vattr vattr;
911 	size_t hdrsize, bsize;
912 	int error;
913 
914 	/*
915 	 * See if we have an existing shadow
916 	 * If none, we have to manufacture one
917 	 */
918 	mutex_enter(&dctable_lock);
919 	dp = dcfind(vp);
920 	mutex_exit(&dctable_lock);
921 	if (dp != NULL)
922 		return (DCTOV(dp));
923 
924 	/*
925 	 * Make sure it's a valid compressed file
926 	 */
927 	hdr = &thdr;
928 	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
929 	    UIO_SYSSPACE, 0, 0, cred, NULL);
930 	if (error || hdr->ch_magic != CH_MAGIC_ZLIB ||
931 	    hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
932 	    hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
933 	    hdr->ch_blksize > ptob(DCCACHESIZE) ||
934 	    (hdr->ch_blksize & (hdr->ch_blksize - 1)) != 0)
935 		return (NULL);
936 
937 	/* get underlying file size */
938 	if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
939 		return (NULL);
940 
941 	/*
942 	 * Re-read entire header
943 	 */
944 	hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
945 	hdr = kmem_alloc(hdrsize, KM_SLEEP);
946 	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
947 	    0, 0, cred, NULL);
948 	if (error) {
949 		kmem_free(hdr, hdrsize);
950 		return (NULL);
951 	}
952 
953 	/*
954 	 * add extra blkmap entry to make dc_getblock()'s
955 	 * life easier
956 	 */
957 	bsize = hdr->ch_blksize;
958 	hdr->ch_blkmap[((hdr->ch_fsize-1) / bsize) + 1] = vattr.va_size;
959 
960 	ndp = dcnode_alloc();
961 	ndp->dc_subvp = vp;
962 	VN_HOLD(vp);
963 	ndp->dc_hdr = hdr;
964 	ndp->dc_hdrsize = hdrsize;
965 
966 	/*
967 	 * Allocate kmem cache if none there already
968 	 */
969 	ndp->dc_zmax = ZMAXBUF(bsize);
970 	cpp = &dcbuf_cache[btop(bsize)];
971 	mutex_enter(&dccache_lock);
972 	if (*cpp == NULL)
973 		*cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
974 		    NULL, NULL, NULL, NULL, 0);
975 	mutex_exit(&dccache_lock);
976 	ndp->dc_bufcache = *cpp;
977 
978 	/*
979 	 * Recheck table in case someone else created shadow
980 	 * while we were blocked above.
981 	 */
982 	mutex_enter(&dctable_lock);
983 	dp = dcfind(vp);
984 	if (dp != NULL) {
985 		mutex_exit(&dctable_lock);
986 		dcnode_recycle(ndp);
987 		kmem_cache_free(dcnode_cache, ndp);
988 		return (DCTOV(dp));
989 	}
990 	dcinsert(ndp);
991 	mutex_exit(&dctable_lock);
992 
993 	return (DCTOV(ndp));
994 }
995 
996 
997 /*
998  * dcnode lookup table
999  * These routines maintain a table of dcnodes hashed by their
1000  * subordinate vnode so that they can be found if they already
1001  * exist in the vnode cache
1002  */
1003 
1004 /*
1005  * Put a dcnode in the table.
1006  */
1007 static void
1008 dcinsert(struct dcnode *newdp)
1009 {
1010 	int idx = DCHASH(newdp->dc_subvp);
1011 
1012 	ASSERT(MUTEX_HELD(&dctable_lock));
1013 	newdp->dc_hash = dctable[idx];
1014 	dctable[idx] = newdp;
1015 }
1016 
1017 /*
1018  * Remove a dcnode from the hash table.
1019  */
1020 void
1021 dcdelete(struct dcnode *deldp)
1022 {
1023 	int idx = DCHASH(deldp->dc_subvp);
1024 	struct dcnode *dp, *prevdp;
1025 
1026 	ASSERT(MUTEX_HELD(&dctable_lock));
1027 	dp = dctable[idx];
1028 	if (dp == deldp)
1029 		dctable[idx] = dp->dc_hash;
1030 	else {
1031 		for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
1032 		    prevdp = dp, dp = dp->dc_hash) {
1033 			if (dp == deldp) {
1034 				prevdp->dc_hash = dp->dc_hash;
1035 				break;
1036 			}
1037 		}
1038 	}
1039 	ASSERT(dp != NULL);
1040 }
1041 
1042 /*
1043  * Find a shadow vnode in the dctable hash list.
1044  */
1045 static struct dcnode *
1046 dcfind(struct vnode *vp)
1047 {
1048 	struct dcnode *dp;
1049 
1050 	ASSERT(MUTEX_HELD(&dctable_lock));
1051 	for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
1052 		if (dp->dc_subvp == vp) {
1053 			VN_HOLD(DCTOV(dp));
1054 			if (dp->dc_lrunext)
1055 				dclru_sub(dp);
1056 			return (dp);
1057 		}
1058 	return (NULL);
1059 }
1060 
#ifdef	DEBUG
/*
 * Count the dcnodes on the circular LRU list by walking it once; used
 * to cross-check dclru_len in DEBUG kernels.
 */
static int
dclru_count(void)
{
	struct dcnode *dp = dclru;
	int count = 0;

	if (dp == NULL)
		return (0);
	do {
		count++;
		dp = dp->dc_lrunext;
	} while (dp != dclru);
	return (count);
}
#endif
1075 
1076 static void
1077 dclru_add(struct dcnode *dp)
1078 {
1079 	/*
1080 	 * Add to dclru as double-link chain
1081 	 */
1082 	ASSERT(MUTEX_HELD(&dctable_lock));
1083 	if (dclru == NULL) {
1084 		dclru = dp;
1085 		dp->dc_lruprev = dp->dc_lrunext = dp;
1086 	} else {
1087 		struct dcnode *last = dclru->dc_lruprev;
1088 
1089 		dclru->dc_lruprev = dp;
1090 		last->dc_lrunext = dp;
1091 		dp->dc_lruprev = last;
1092 		dp->dc_lrunext = dclru;
1093 	}
1094 	dclru_len++;
1095 	ASSERT(dclru_len == dclru_count());
1096 }
1097 
1098 static void
1099 dclru_sub(struct dcnode *dp)
1100 {
1101 	ASSERT(MUTEX_HELD(&dctable_lock));
1102 	dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
1103 	dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
1104 	if (dp == dclru)
1105 		dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
1106 	dp->dc_lrunext = dp->dc_lruprev = NULL;
1107 	dclru_len--;
1108 	ASSERT(dclru_len == dclru_count());
1109 }
1110