xref: /illumos-gate/usr/src/uts/common/fs/dcfs/dc_vnops.c (revision 6e0cbcaa0c6f2bc34634a4cc17b099f9ecef03d1)
1 
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #include <sys/types.h>
40 #include <sys/thread.h>
41 #include <sys/t_lock.h>
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/bitmap.h>
45 #include <sys/buf.h>
46 #include <sys/cmn_err.h>
47 #include <sys/conf.h>
48 #include <sys/ddi.h>
49 #include <sys/debug.h>
50 #include <sys/errno.h>
51 #include <sys/time.h>
52 #include <sys/fcntl.h>
53 #include <sys/flock.h>
54 #include <sys/file.h>
55 #include <sys/kmem.h>
56 #include <sys/mman.h>
57 #include <sys/vmsystm.h>
58 #include <sys/open.h>
59 #include <sys/swap.h>
60 #include <sys/sysmacros.h>
61 #include <sys/uio.h>
62 #include <sys/vfs.h>
63 #include <sys/vfs_opreg.h>
64 #include <sys/vnode.h>
65 #include <sys/stat.h>
66 #include <sys/poll.h>
67 #include <sys/zmod.h>
68 #include <sys/fs/decomp.h>
69 
70 #include <vm/hat.h>
71 #include <vm/as.h>
72 #include <vm/page.h>
73 #include <vm/pvn.h>
74 #include <vm/seg_vn.h>
75 #include <vm/seg_kmem.h>
76 #include <vm/seg_map.h>
77 
78 #include <fs/fs_subr.h>
79 
80 /*
81  * dcfs - A filesystem for automatic decompression of fiocompressed files
82  *
83  * This filesystem is a layered filesystem that sits on top of a normal
84  * persistent filesystem and provides automatic decompression of files
85  * that have been previously compressed and stored on the host file system.
86  * This is a pseudo filesystem in that it does not persist data; rather, it
87  * intercepts file lookup requests on the host filesystem and provides
88  * transparent decompression of those files. Currently the only supported
89  * host filesystem is ufs.
90  *
91  * A file is compressed via a userland utility (currently cmd/boot/fiocompress)
92  * and marked by fiocompress as a compressed file via a flag in the on-disk
93  * inode (set via the _FIO_COMPRESSED ufs ioctl() - see ufs_ioctl() in
94  * ufs_vnops.c). ufs_lookup() checks for this flag; if it is set, it
95  * passes control to decompvp(), defined in this (dcfs) filesystem, which
96  * uncompresses the file and returns a dcfs vnode to the VFS layer.
97  *
98  * dcfs is layered on top of ufs and passes requests involving persistence
99  * to the underlying ufs filesystem. The compressed files currently cannot be
100  * written to.
101  */
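
/*
 * For orientation, a minimal sketch of the hand-off described above.
 * This is an assumption for illustration, not code copied from
 * ufs_lookup(); ICOMPRESS stands in for the on-disk inode flag that the
 * _FIO_COMPRESSED ioctl sets:
 *
 *	if (ip->i_cflags & ICOMPRESS) {
 *		cvp = decompvp(vp, cr, ct);
 *		if (cvp != NULL)
 *			vp = cvp;	(hand the dcfs shadow vnode to VFS)
 *	}
 */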
102 
103 
104 /*
105  * Define data structures within this file.
106  */
107 #define	DCSHFT		5
108 #define	DCTABLESIZE	16
109 
110 #if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
111 #define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
112 #else
113 #define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) % DCTABLESIZE)
114 #endif
115 
116 #define	DCLRUSIZE	16
117 
118 #define	DCCACHESIZE	4
119 
120 #define	rounddown(x, y)	((x) & ~((y) - 1))
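
/*
 * Worked example with illustrative values: DCTABLESIZE is 16, a power
 * of two, so the '&' form of DCHASH above is the one compiled in.  For
 * a hypothetical vnode address 0x520:
 *
 *	DCHASH((struct vnode *)0x520) == (0x520 >> 5) & 0xf
 *				      == 0x29 & 0xf == 9
 *
 * Likewise rounddown(0x12345, 0x1000) == 0x12000.  Both macros assume
 * a power-of-two size operand.
 */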
121 
122 struct dcnode	*dctable[DCTABLESIZE];
123 
124 struct dcnode	*dclru;
125 static int	dclru_len;
126 
127 kmutex_t	dctable_lock;
128 
129 dev_t		dcdev;
130 struct vfs	dc_vfs;
131 
132 struct kmem_cache *dcnode_cache;
133 struct kmem_cache *dcbuf_cache[DCCACHESIZE];
134 
135 kmutex_t	dccache_lock;
136 
137 static int dcinit(int, char *);
138 
139 static struct dcnode	*dcnode_alloc(void);
140 static void		dcnode_free(struct dcnode *);
141 static void		dcnode_recycle(struct dcnode *);
142 
143 static void		dcinsert(struct dcnode *);
144 static void		dcdelete(struct dcnode *);
145 static struct dcnode	*dcfind(struct vnode *);
146 static void		dclru_add(struct dcnode *);
147 static void		dclru_sub(struct dcnode *);
148 
149 
150 /*
151  * This is the loadable module wrapper.
152  */
153 #include <sys/modctl.h>
154 
155 struct vfsops *dc_vfsops;
156 
157 static vfsdef_t vfw = {
158 	VFSDEF_VERSION,
159 	"dcfs",
160 	dcinit,
161 	VSW_ZMOUNT,
162 	NULL
163 };
164 
165 /*
166  * Module linkage information for the kernel.
167  */
168 extern struct mod_ops mod_fsops;
169 
170 static struct modlfs modlfs = {
171 	&mod_fsops, "compressed filesystem", &vfw
172 };
173 
174 static struct modlinkage modlinkage = {
175 	MODREV_1, (void *)&modlfs, NULL
176 };
177 
178 int
179 _init()
180 {
181 	return (mod_install(&modlinkage));
182 }
183 
184 int
185 _info(struct modinfo *modinfop)
186 {
187 	return (mod_info(&modlinkage, modinfop));
188 }
189 
190 
191 static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
192 static int dc_close(struct vnode *, int, int, offset_t,
193     struct cred *, caller_context_t *);
194 static int dc_read(struct vnode *, struct uio *, int, struct cred *,
195     caller_context_t *);
196 static int dc_getattr(struct vnode *, struct vattr *, int,
197     struct cred *, caller_context_t *);
198 static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
199     caller_context_t *);
200 static int dc_access(struct vnode *, int, int,
201     struct cred *, caller_context_t *);
202 static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
203 static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
204 static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
205 static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
206 static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
207     struct flk_callback *, struct cred *, caller_context_t *);
208 static int dc_realvp(struct vnode *, struct vnode **, caller_context_t *);
209 static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
210     struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
211     struct cred *, caller_context_t *);
212 static int dc_putpage(struct vnode *, offset_t, size_t, int,
213     struct cred *, caller_context_t *);
214 static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
215     uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
216 static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
217     uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
218 static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
219     uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
220 
221 struct vnodeops *dc_vnodeops;
222 
223 const fs_operation_def_t dc_vnodeops_template[] = {
224 	VOPNAME_OPEN,			{ .vop_open = dc_open },
225 	VOPNAME_CLOSE,			{ .vop_close = dc_close },
226 	VOPNAME_READ,			{ .vop_read = dc_read },
227 	VOPNAME_GETATTR,		{ .vop_getattr = dc_getattr },
228 	VOPNAME_SETATTR,		{ .vop_setattr = dc_setattr },
229 	VOPNAME_ACCESS,			{ .vop_access = dc_access },
230 	VOPNAME_FSYNC,			{ .vop_fsync = dc_fsync },
231 	VOPNAME_INACTIVE,		{ .vop_inactive = dc_inactive },
232 	VOPNAME_FID,			{ .vop_fid = dc_fid },
233 	VOPNAME_SEEK,			{ .vop_seek = dc_seek },
234 	VOPNAME_FRLOCK,			{ .vop_frlock = dc_frlock },
235 	VOPNAME_REALVP,			{ .vop_realvp = dc_realvp },
236 	VOPNAME_GETPAGE,		{ .vop_getpage = dc_getpage },
237 	VOPNAME_PUTPAGE,		{ .vop_putpage = dc_putpage },
238 	VOPNAME_MAP,			{ .vop_map = dc_map },
239 	VOPNAME_ADDMAP,			{ .vop_addmap = dc_addmap },
240 	VOPNAME_DELMAP,			{ .vop_delmap = dc_delmap },
241 	NULL,				NULL
242 };
243 
244 /*ARGSUSED*/
245 static int
246 dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
247 {
248 	return (0);
249 }
250 
251 /*ARGSUSED*/
252 static int
253 dc_close(struct vnode *vp, int flag, int count, offset_t off,
254     struct cred *cr, caller_context_t *ctp)
255 {
256 	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
257 	cleanshares(vp, ttoproc(curthread)->p_pid);
258 	return (0);
259 }
260 
261 /*ARGSUSED*/
262 static int
263 dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
264 	caller_context_t *ct)
265 {
266 	struct dcnode *dp = VTODC(vp);
267 	size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
268 	size_t fsize = dp->dc_hdr->ch_fsize;
269 	int error;
270 
271 	/*
272 	 * Loop through the file with segmap; decompression happens
273 	 * in dc_getblock_miss(), reached via dc_getpage()
274 	 */
275 	do {
276 		caddr_t base;
277 		size_t n;
278 		offset_t mapon;
279 
280 		/*
281 		 * read to end of block or file
282 		 */
283 		mapon = uiop->uio_loffset & (rdsize - 1);
284 		n = MIN(rdsize - mapon, uiop->uio_resid);
285 		n = MIN(n, fsize - uiop->uio_loffset);
286 		if (n == 0)
287 			return (0);	/* at EOF */
288 
289 		base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
290 		    S_READ);
291 		error = uiomove(base + mapon, n, UIO_READ, uiop);
292 		if (!error) {
293 			uint_t flags;
294 
295 			if (n + mapon == rdsize || uiop->uio_loffset == fsize)
296 				flags = SM_DONTNEED;
297 			else
298 				flags = 0;
299 			error = segmap_release(segkmap, base, flags);
300 		} else
301 			(void) segmap_release(segkmap, base, 0);
302 	} while (!error && uiop->uio_resid);
303 
304 	return (error);
305 }
306 
307 static int
308 dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
309     cred_t *cred, caller_context_t *ctp)
310 {
311 	struct dcnode *dp = VTODC(vp);
312 	struct vnode *subvp = dp->dc_subvp;
313 	int error;
314 
315 	error = VOP_GETATTR(subvp, vap, flags, cred, ctp);
316 
317 	/* substitute uncompressed size */
318 	vap->va_size = dp->dc_hdr->ch_fsize;
319 	return (error);
320 }
321 
322 static int
323 dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
324     caller_context_t *ctp)
325 {
326 	struct dcnode *dp = VTODC(vp);
327 	struct vnode *subvp = dp->dc_subvp;
328 
329 	return (VOP_SETATTR(subvp, vap, flags, cred, ctp));
330 }
331 
332 static int
333 dc_access(struct vnode *vp, int mode, int flags,
334     cred_t *cred, caller_context_t *ctp)
335 {
336 	struct dcnode *dp = VTODC(vp);
337 	struct vnode *subvp = dp->dc_subvp;
338 
339 	return (VOP_ACCESS(subvp, mode, flags, cred, ctp));
340 }
341 
342 /*ARGSUSED*/
343 static int
344 dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
345 {
346 	return (0);
347 }
348 
349 /*ARGSUSED*/
350 static void
351 dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
352 {
353 	struct dcnode *dp = VTODC(vp);
354 
355 	mutex_enter(&dctable_lock);
356 	mutex_enter(&vp->v_lock);
357 	ASSERT(vp->v_count >= 1);
358 	if (--vp->v_count != 0) {
359 		/*
360 		 * Somebody accessed the dcnode before we got a chance to
361 		 * remove it.  They will remove it when they do a vn_rele.
362 		 */
363 		mutex_exit(&vp->v_lock);
364 		mutex_exit(&dctable_lock);
365 		return;
366 	}
367 	mutex_exit(&vp->v_lock);
368 
369 	dcnode_free(dp);
370 
371 	mutex_exit(&dctable_lock);
372 }
373 
374 static int
375 dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
376 {
377 	struct dcnode *dp = VTODC(vp);
378 	struct vnode *subvp = dp->dc_subvp;
379 
380 	return (VOP_FID(subvp, fidp, ctp));
381 }
382 
383 static int
384 dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
385 {
386 	struct dcnode *dp = VTODC(vp);
387 	struct vnode *subvp = dp->dc_subvp;
388 
389 	return (VOP_SEEK(subvp, oof, noffp, ctp));
390 }
391 
392 static int
393 dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
394     offset_t offset, struct flk_callback *flk_cbp,
395     cred_t *cr, caller_context_t *ctp)
396 {
397 	struct dcnode *dp = VTODC(vp);
398 
399 	/*
400 	 * If file is being mapped, disallow frlock.
401 	 */
402 	if (dp->dc_mapcnt > 0)
403 		return (EAGAIN);
404 
405 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
406 }
407 
408 /*ARGSUSED*/
409 static int
410 dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
411     struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
412 {
413 	struct dcnode *dp = VTODC(vp);
414 	struct comphdr *hdr = dp->dc_hdr;
415 	struct page *pp;
416 	struct buf *bp;
417 	caddr_t saddr;
418 	off_t cblkno;
419 	size_t rdoff, rdsize, dsize;
420 	long xlen;
421 	int error, zerr;
422 
423 	ASSERT(len == hdr->ch_blksize);
424 	/*
425 	 * Get destination pages and make them addressable
426 	 */
427 	pp = page_create_va(vp, off, len, PG_WAIT, seg, addr);
428 	bp = pageio_setup(pp, len, vp, B_READ);
429 	bp_mapin(bp);
430 
431 	/*
432 	 * read compressed data from subordinate vnode
433 	 */
434 	saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
435 	cblkno = off / len;
436 	rdoff = hdr->ch_blkmap[cblkno];
437 	rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
438 	error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
439 	    UIO_SYSSPACE, 0, 0, cr, NULL);
440 	if (error)
441 		goto cleanup;
442 
443 	/*
444 	 * Uncompress
445 	 */
446 	dsize = len;
447 	zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
448 	if (zerr != Z_OK) {
449 		error = EIO;
450 		goto cleanup;
451 	}
452 
453 	/*
454 	 * Handle EOF
455 	 */
456 	xlen = hdr->ch_fsize - off;
457 	if (xlen < len) {
458 		bzero(bp->b_un.b_addr + xlen, len - xlen);
459 		if (dsize != xlen)
460 			error = EIO;
461 	} else if (dsize != len)
462 		error = EIO;
463 
464 	/*
465 	 * Clean up
466 	 */
467 cleanup:
468 	kmem_cache_free(dp->dc_bufcache, saddr);
469 	pageio_done(bp);
470 	*ppp = pp;
471 	return (error);
472 }
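
/*
 * A note on buffer sizing above (illustrative, not normative): dc_zmax
 * is precomputed in decompvp() as ZMAXBUF(ch_blksize), a worst-case
 * bound on the compressed size of one block, so the buffer taken from
 * dc_bufcache always fits a whole compressed block.  zlib's worst case
 * is slightly larger than its input (on the order of n + n/1000 + 12
 * bytes), which is why the bound exceeds ch_blksize.
 */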
473 
474 static int
475 dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
476     struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
477 {
478 	struct page *pp, *plist = NULL;
479 	offset_t pgoff;
480 	int rdblk;
481 
482 	/*
483 	 * pvn_read_kluster() doesn't quite do what we want, since it
484 	 * thinks sub block reads are ok.  Here we always decompress
485 	 * a full block.
486 	 */
487 
488 	/*
489 	 * Check page cache
490 	 */
491 	rdblk = 0;
492 	for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
493 		pp = page_lookup(vp, pgoff, SE_EXCL);
494 		if (pp == NULL) {
495 			rdblk = 1;
496 			break;
497 		}
498 		page_io_lock(pp);
499 		page_add(&plist, pp);
500 		plist = plist->p_next;
501 	}
502 	if (!rdblk) {
503 		*ppp = plist;
504 		return (0);	/* all pages in cache */
505 	}
506 
507 	/*
508 	 * Undo any locks so getblock_miss has an open field
509 	 */
510 	if (plist != NULL)
511 		pvn_io_done(plist);
512 
513 	return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
514 }
515 
516 static int
517 dc_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
518 {
519 	struct vnode *rvp;
520 
521 	vp = VTODC(vp)->dc_subvp;
522 	if (VOP_REALVP(vp, &rvp, ct) == 0)
523 		vp = rvp;
524 	*vpp = vp;
525 	return (0);
526 }
527 
528 /*ARGSUSED10*/
529 static int
530 dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
531     struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
532     enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
533 {
534 	struct dcnode *dp = VTODC(vp);
535 	struct comphdr *hdr = dp->dc_hdr;
536 	struct page *pp, *plist = NULL;
537 	caddr_t vp_baddr;
538 	offset_t vp_boff, vp_bend;
539 	size_t bsize = hdr->ch_blksize;
540 	int nblks, error;
541 
542 	/* does not support write */
543 	if (rw == S_WRITE) {
544 		panic("write attempt on compressed file");
545 		/*NOTREACHED*/
546 	}
547 
548 	if (protp)
549 		*protp = PROT_ALL;
550 	/*
551 	 * We don't support asynchronous operation at the moment, so
552 	 * just pretend we did it.  If the pages are ever actually
553 	 * needed, they'll get brought in then.
554 	 */
555 	if (pl == NULL)
556 		return (0);
557 
558 	/*
559 	 * Calc block start and end offsets
560 	 */
561 	vp_boff = rounddown(off, bsize);
562 	vp_bend = roundup(off + len, bsize);
563 	vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);
564 
565 	nblks = (vp_bend - vp_boff) / bsize;
566 	for (error = 0; nblks-- > 0 && error == 0;) {
567 		error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
568 		    rw, cr);
569 		page_list_concat(&plist, &pp);
570 		vp_boff += bsize;
571 		vp_baddr += bsize;
572 	}
573 	if (!error)
574 		pvn_plist_init(plist, pl, plsz, off, len, rw);
575 	else
576 		pvn_read_done(plist, B_ERROR);
577 	return (error);
578 }
579 
580 /*
581  * This function should never be reached, since dcfs pages are never
582  * dirty; it exists only as the putapage callback for pvn_vplist_dirty().
583  */
584 /*ARGSUSED*/
585 static int
586 dc_putapage(struct vnode *vp, struct page *pp, u_offset_t *offp, size_t *lenp,
587     int flags, struct cred *cr)
588 {
589 	/* should never happen */
590 	cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
591 	/*NOTREACHED*/
592 	return (0);
593 }
594 
595 
596 /*
597  * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
598  * B_INVAL is set by:
599  *
600  *	1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
601  *	2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
602  *	   which translates to an MC_SYNC with the MS_INVALIDATE flag.
603  *
604  * The B_FREE (as well as the B_DONTNEED) flag is set when the
605  * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
606  * from SEGVN to release pages behind a pagefault.
607  */
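/*
 * For illustration, one userland sequence that reaches this routine
 * with B_INVAL set (a sketch; fd is assumed to refer to a mapped,
 * fiocompressed file):
 *
 *	caddr_t a = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
 *	(void) msync(a, len, MS_INVALIDATE);
 *
 * msync(MS_INVALIDATE) is carried out by the MC_SYNC command of
 * memcntl(2) described above.
 */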
608 /*ARGSUSED5*/
609 static int
610 dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
611     struct cred *cr, caller_context_t *ctp)
612 {
613 	int error = 0;
614 
615 	if (vp->v_count == 0) {
616 		panic("dcfs_putpage: bad v_count");
617 		/*NOTREACHED*/
618 	}
619 
620 	if (vp->v_flag & VNOMAP)
621 		return (ENOSYS);
622 
623 	if (!vn_has_cached_data(vp))	/* no pages mapped */
624 		return (0);
625 
626 	if (len == 0)		/* from 'off' to EOF */
627 		error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
628 	else {
629 		offset_t io_off;
630 		se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
631 
632 		for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
633 			page_t *pp;
634 
635 			/*
636 			 * We insist on getting the page only if we are
637 			 * about to invalidate, free or write it and
638 			 * the B_ASYNC flag is not set.
639 			 */
640 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
641 				pp = page_lookup(vp, io_off, se);
642 			else
643 				pp = page_lookup_nowait(vp, io_off, se);
644 
645 			if (pp == NULL)
646 				continue;
647 			/*
648 			 * Normally pvn_getdirty() should return 0, which
649 	 * implies that it has done the job for us.
650 			 * The shouldn't-happen scenario is when it returns 1.
651 			 * This means that the page has been modified and
652 			 * needs to be put back.
653 			 * Since we can't write to a dcfs compressed file,
654 			 * we fake a failed I/O and force pvn_write_done()
655 			 * to destroy the page.
656 			 */
657 			if (pvn_getdirty(pp, flags) == 1) {
658 				cmn_err(CE_NOTE, "dc_putpage: dirty page");
659 				pvn_write_done(pp, flags |
660 				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
661 			}
662 		}
663 	}
664 	return (error);
665 }
666 
667 static int
668 dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
669     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
670     struct cred *cred, caller_context_t *ctp)
671 {
672 	struct vattr vattr;
673 	struct segvn_crargs vn_a;
674 	int error;
675 
676 	if (vp->v_flag & VNOMAP)
677 		return (ENOSYS);
678 
679 	if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0)
680 		return (ENXIO);
681 
682 	/*
683 	 * If file is being locked, disallow mapping.
684 	 */
685 	if (error = VOP_GETATTR(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
686 		return (error);
687 	if (vn_has_mandatory_locks(vp, vattr.va_mode))
688 		return (EAGAIN);
689 
690 	as_rangelock(as);
691 
692 	if ((flags & MAP_FIXED) == 0) {
693 		map_addr(addrp, len, off, 1, flags);
694 		if (*addrp == NULL) {
695 			as_rangeunlock(as);
696 			return (ENOMEM);
697 		}
698 	} else {
699 		/*
700 		 * User specified address - blow away any previous mappings
701 		 */
702 		(void) as_unmap(as, *addrp, len);
703 	}
704 
705 	vn_a.vp = vp;
706 	vn_a.offset = off;
707 	vn_a.type = flags & MAP_TYPE;
708 	vn_a.prot = prot;
709 	vn_a.maxprot = maxprot;
710 	vn_a.flags = flags & ~MAP_TYPE;
711 	vn_a.cred = cred;
712 	vn_a.amp = NULL;
713 	vn_a.szc = 0;
714 	vn_a.lgrp_mem_policy_flags = 0;
715 
716 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
717 	as_rangeunlock(as);
718 	return (error);
719 }
720 
721 /*ARGSUSED*/
722 static int
723 dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
724     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
725     struct cred *cr, caller_context_t *ctp)
726 {
727 	struct dcnode *dp;
728 
729 	if (vp->v_flag & VNOMAP)
730 		return (ENOSYS);
731 
732 	dp = VTODC(vp);
733 	mutex_enter(&dp->dc_lock);
734 	dp->dc_mapcnt += btopr(len);
735 	mutex_exit(&dp->dc_lock);
736 	return (0);
737 }
738 
739 /*ARGSUSED*/
740 static int
741 dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
742     size_t len, uint_t prot, uint_t maxprot, uint_t flags,
743     struct cred *cr, caller_context_t *ctp)
744 {
745 	struct dcnode *dp;
746 
747 	if (vp->v_flag & VNOMAP)
748 		return (ENOSYS);
749 
750 	dp = VTODC(vp);
751 	mutex_enter(&dp->dc_lock);
752 	dp->dc_mapcnt -= btopr(len);
753 	ASSERT(dp->dc_mapcnt >= 0);
754 	mutex_exit(&dp->dc_lock);
755 	return (0);
756 }
757 
758 /*
759  * Constructor/destructor routines for dcnodes
760  */
761 /*ARGSUSED1*/
762 static int
763 dcnode_constructor(void *buf, void *cdrarg, int kmflags)
764 {
765 	struct dcnode *dp = buf;
766 	struct vnode *vp;
767 
768 	vp = dp->dc_vp = vn_alloc(kmflags);
769 	if (vp == NULL) {
770 		return (-1);
771 	}
772 	vp->v_data = dp;
773 	vp->v_type = VREG;
774 	vp->v_flag = VNOSWAP;
775 	vp->v_vfsp = &dc_vfs;
776 	vn_setops(vp, dc_vnodeops);
777 	vn_exists(vp);
778 
779 	mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
780 	dp->dc_mapcnt = 0;
781 	dp->dc_lrunext = dp->dc_lruprev = NULL;
782 	dp->dc_hdr = NULL;
783 	dp->dc_subvp = NULL;
784 	return (0);
785 }
786 
787 /*ARGSUSED*/
788 static void
789 dcnode_destructor(void *buf, void *cdrarg)
790 {
791 	struct dcnode *dp = buf;
792 	struct vnode *vp = DCTOV(dp);
793 
794 	mutex_destroy(&dp->dc_lock);
795 
796 	VERIFY(dp->dc_hdr == NULL);
797 	VERIFY(dp->dc_subvp == NULL);
798 	vn_invalid(vp);
799 	vn_free(vp);
800 }
801 
802 static struct dcnode *
803 dcnode_alloc(void)
804 {
805 	struct dcnode *dp;
806 
807 	/*
808 	 * If the LRU list has grown to DCLRUSIZE entries,
809 	 * re-use a node from it instead of allocating a new one
810 	 */
811 	mutex_enter(&dctable_lock);
812 	if (dclru_len < DCLRUSIZE) {
813 		mutex_exit(&dctable_lock);
814 		dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
815 	} else {
816 		ASSERT(dclru != NULL);
817 		dp = dclru;
818 		dclru_sub(dp);
819 		dcdelete(dp);
820 		mutex_exit(&dctable_lock);
821 		dcnode_recycle(dp);
822 	}
823 	return (dp);
824 }
825 
826 static void
827 dcnode_free(struct dcnode *dp)
828 {
829 	struct vnode *vp = DCTOV(dp);
830 
831 	ASSERT(MUTEX_HELD(&dctable_lock));
832 
833 	/*
834 	 * If no cached pages, no need to put it on lru
835 	 */
836 	if (!vn_has_cached_data(vp)) {
837 		dcdelete(dp);
838 		dcnode_recycle(dp);
839 		kmem_cache_free(dcnode_cache, dp);
840 		return;
841 	}
842 
843 	/*
844 	 * Add to lru, if it's over the limit, free from head
845 	 */
846 	dclru_add(dp);
847 	if (dclru_len > DCLRUSIZE) {
848 		dp = dclru;
849 		dclru_sub(dp);
850 		dcdelete(dp);
851 		dcnode_recycle(dp);
852 		kmem_cache_free(dcnode_cache, dp);
853 	}
854 }
855 
856 static void
857 dcnode_recycle(struct dcnode *dp)
858 {
859 	struct vnode *vp;
860 
861 	vp = DCTOV(dp);
862 
863 	VN_RELE(dp->dc_subvp);
864 	dp->dc_subvp = NULL;
865 	(void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
866 	kmem_free(dp->dc_hdr, dp->dc_hdrsize);
867 	dp->dc_hdr = NULL;
868 	dp->dc_hdrsize = dp->dc_zmax = 0;
869 	dp->dc_bufcache = NULL;
870 	dp->dc_mapcnt = 0;
871 	vn_reinit(vp);
872 	vp->v_type = VREG;
873 	vp->v_flag = VNOSWAP;
874 	vp->v_vfsp = &dc_vfs;
875 }
876 
877 static int
878 dcinit(int fstype, char *name)
879 {
880 	static const fs_operation_def_t dc_vfsops_template[] = {
881 		NULL, NULL
882 	};
883 	int error;
884 	major_t dev;
885 
886 	error = vfs_setfsops(fstype, dc_vfsops_template, &dc_vfsops);
887 	if (error) {
888 		cmn_err(CE_WARN, "dcinit: bad vfs ops template");
889 		return (error);
890 	}
891 	VFS_INIT(&dc_vfs, dc_vfsops, NULL);
892 	dc_vfs.vfs_flag = VFS_RDONLY;
893 	dc_vfs.vfs_fstype = fstype;
894 	if ((dev = getudev()) == (major_t)-1)
895 		dev = 0;
896 	dcdev = makedevice(dev, 0);
897 	dc_vfs.vfs_dev = dcdev;
898 
899 	error = vn_make_ops(name, dc_vnodeops_template, &dc_vnodeops);
900 	if (error != 0) {
901 		(void) vfs_freevfsops_by_type(fstype);
902 		cmn_err(CE_WARN, "dcinit: bad vnode ops template");
903 		return (error);
904 	}
905 
906 	mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
907 	mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
908 	dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
909 	    0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);
910 
911 	return (0);
912 }
913 
914 /*
915  * Return shadow vnode with the given vp as its subordinate
916  */
917 struct vnode *
918 decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
919 {
920 	struct dcnode *dp, *ndp;
921 	struct comphdr thdr, *hdr;
922 	struct kmem_cache **cpp;
923 	struct vattr vattr;
924 	size_t hdrsize, bsize;
925 	int error;
926 
927 	/*
928 	 * See if we have an existing shadow
929 	 * If none, we have to manufacture one
930 	 */
931 	mutex_enter(&dctable_lock);
932 	dp = dcfind(vp);
933 	mutex_exit(&dctable_lock);
934 	if (dp != NULL)
935 		return (DCTOV(dp));
936 
937 	/*
938 	 * Make sure it's a valid compressed file
939 	 */
940 	hdr = &thdr;
941 	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
942 	    UIO_SYSSPACE, 0, 0, cred, NULL);
943 	if (error || hdr->ch_magic != CH_MAGIC_ZLIB ||
944 	    hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
945 	    hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
946 	    hdr->ch_blksize > ptob(DCCACHESIZE) ||
947 	    (hdr->ch_blksize & (hdr->ch_blksize - 1)) != 0)
948 		return (NULL);
949 
950 	/* get underlying file size */
951 	if (VOP_GETATTR(vp, &vattr, 0, cred, ctp) != 0)
952 		return (NULL);
953 
954 	/*
955 	 * Re-read entire header
956 	 */
957 	hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
958 	hdr = kmem_alloc(hdrsize, KM_SLEEP);
959 	error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
960 	    0, 0, cred, NULL);
961 	if (error) {
962 		kmem_free(hdr, hdrsize);
963 		return (NULL);
964 	}
965 
966 	/*
967 	 * add extra blkmap entry to make dc_getblock()'s
968 	 * life easier
969 	 */
970 	bsize = hdr->ch_blksize;
971 	hdr->ch_blkmap[((hdr->ch_fsize-1) / bsize) + 1] = vattr.va_size;
972 
973 	ndp = dcnode_alloc();
974 	ndp->dc_subvp = vp;
975 	VN_HOLD(vp);
976 	ndp->dc_hdr = hdr;
977 	ndp->dc_hdrsize = hdrsize;
978 
979 	/*
980 	 * Allocate kmem cache if none there already
981 	 */
982 	ndp->dc_zmax = ZMAXBUF(bsize);
983 	cpp = &dcbuf_cache[btop(bsize)];
984 	mutex_enter(&dccache_lock);
985 	if (*cpp == NULL)
986 		*cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
987 		    NULL, NULL, NULL, NULL, 0);
988 	mutex_exit(&dccache_lock);
989 	ndp->dc_bufcache = *cpp;
990 
991 	/*
992 	 * Recheck table in case someone else created shadow
993 	 * while we were blocked above.
994 	 */
995 	mutex_enter(&dctable_lock);
996 	dp = dcfind(vp);
997 	if (dp != NULL) {
998 		mutex_exit(&dctable_lock);
999 		dcnode_recycle(ndp);
1000 		kmem_cache_free(dcnode_cache, ndp);
1001 		return (DCTOV(dp));
1002 	}
1003 	dcinsert(ndp);
1004 	mutex_exit(&dctable_lock);
1005 
1006 	return (DCTOV(ndp));
1007 }
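
/*
 * A worked sketch of the extra blkmap entry appended above (values
 * hypothetical): with the sentinel in place, dc_getblock_miss() sizes
 * every compressed block, including the last one, with uniform
 * arithmetic:
 *
 *	cblkno = off / ch_blksize;
 *	rdoff  = ch_blkmap[cblkno];
 *	rdsize = ch_blkmap[cblkno + 1] - rdoff;
 *
 * For the final block, ch_blkmap[cblkno + 1] is the subordinate file's
 * size (vattr.va_size), so rdsize comes out right with no special case.
 */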
1008 
1009 
1010 /*
1011  * dcnode lookup table
1012  * These routines maintain a table of dcnodes hashed by their
1013  * subordinate vnode so that they can be found if they already
1014  * exist in the vnode cache
1015  */
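
/*
 * Shape of the bookkeeping, for illustration: a dcnode can be on two
 * structures at once, a dctable hash chain (always, while live) and the
 * circular dclru list (only while unreferenced but still holding cached
 * pages):
 *
 *	dctable[DCHASH(subvp)] --> dcnode --> dcnode --> NULL    (dc_hash)
 *	dclru <--> dcnode <--> dcnode <--> ... <--> back to dclru
 *							(dc_lrunext/dc_lruprev)
 */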
1016 
1017 /*
1018  * Put a dcnode in the table.
1019  */
1020 static void
1021 dcinsert(struct dcnode *newdp)
1022 {
1023 	int idx = DCHASH(newdp->dc_subvp);
1024 
1025 	ASSERT(MUTEX_HELD(&dctable_lock));
1026 	newdp->dc_hash = dctable[idx];
1027 	dctable[idx] = newdp;
1028 }
1029 
1030 /*
1031  * Remove a dcnode from the hash table.
1032  */
1033 static void
1034 dcdelete(struct dcnode *deldp)
1035 {
1036 	int idx = DCHASH(deldp->dc_subvp);
1037 	struct dcnode *dp, *prevdp;
1038 
1039 	ASSERT(MUTEX_HELD(&dctable_lock));
1040 	dp = dctable[idx];
1041 	if (dp == deldp)
1042 		dctable[idx] = dp->dc_hash;
1043 	else {
1044 		for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
1045 		    prevdp = dp, dp = dp->dc_hash) {
1046 			if (dp == deldp) {
1047 				prevdp->dc_hash = dp->dc_hash;
1048 				break;
1049 			}
1050 		}
1051 	}
1052 	ASSERT(dp != NULL);
1053 }
1054 
1055 /*
1056  * Find a shadow vnode in the dctable hash list.
1057  */
1058 static struct dcnode *
1059 dcfind(struct vnode *vp)
1060 {
1061 	struct dcnode *dp;
1062 
1063 	ASSERT(MUTEX_HELD(&dctable_lock));
1064 	for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
1065 		if (dp->dc_subvp == vp) {
1066 			VN_HOLD(DCTOV(dp));
1067 			if (dp->dc_lrunext)
1068 				dclru_sub(dp);
1069 			return (dp);
1070 		}
1071 	return (NULL);
1072 }
1073 
1074 #ifdef	DEBUG
1075 static int
1076 dclru_count(void)
1077 {
1078 	struct dcnode *dp;
1079 	int i = 0;
1080 
1081 	if (dclru == NULL)
1082 		return (0);
1083 	for (dp = dclru; dp->dc_lrunext != dclru; dp = dp->dc_lrunext)
1084 		i++;
1085 	return (i + 1);
1086 }
1087 #endif
1088 
1089 static void
1090 dclru_add(struct dcnode *dp)
1091 {
1092 	/*
1093 	 * Add to dclru as double-link chain
1094 	 */
1095 	ASSERT(MUTEX_HELD(&dctable_lock));
1096 	if (dclru == NULL) {
1097 		dclru = dp;
1098 		dp->dc_lruprev = dp->dc_lrunext = dp;
1099 	} else {
1100 		struct dcnode *last = dclru->dc_lruprev;
1101 
1102 		dclru->dc_lruprev = dp;
1103 		last->dc_lrunext = dp;
1104 		dp->dc_lruprev = last;
1105 		dp->dc_lrunext = dclru;
1106 	}
1107 	dclru_len++;
1108 	ASSERT(dclru_len == dclru_count());
1109 }
1110 
1111 static void
1112 dclru_sub(struct dcnode *dp)
1113 {
1114 	ASSERT(MUTEX_HELD(&dctable_lock));
1115 	dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
1116 	dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
1117 	if (dp == dclru)
1118 		dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
1119 	dp->dc_lrunext = dp->dc_lruprev = NULL;
1120 	dclru_len--;
1121 	ASSERT(dclru_len == dclru_count());
1122 }
1123